diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c0a97cd2..5b8ab2960 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.8-dev0 +## 0.5.8-dev1 ### Enhancements @@ -11,6 +11,9 @@ ### Fixes * `convert_file_to_text` now passes through the `source_format` and `target_format` kwargs. +* Partitioning functions that accept a `text` kwarg no longer raise an error if an empty + string is passed (an empty list of elements is returned instead). +* `partition_json` no longer fails if the input is an empty list. ## 0.5.7 diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 60e8ff7b6..3032c24ce 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -140,6 +140,10 @@ def test_partition_email_from_text(): assert elements == EXPECTED_OUTPUT +def test_partition_email_from_text_work_with_empty_string(): + assert partition_email(text="") == [] + + def test_partition_email_from_filename_with_embedded_image(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-image-embedded.eml") elements = partition_email(filename=filename, content_source="text/plain") diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 1d3e8666d..6b6c39146 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -41,6 +41,10 @@ def test_partition_html_from_text(): assert len(elements) > 0 +def test_partition_html_from_text_works_with_empty_string(): + assert partition_html(text="") == [] + + class MockResponse: def __init__(self, text, status_code, headers={}): self.text = text diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index 84d9f2c2e..7a7625ec6 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -87,6 +87,14 @@ def 
test_partition_json_raises_with_none_specified(): partition_json() +def test_partition_json_works_with_empty_string(): + assert partition_json(text="") == [] + + +def test_partition_json_works_with_empty_list(): + assert partition_json(text="[]") == [] + + def test_partition_json_raises_with_too_many_specified(): path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") elements = partition(filename=path) diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py index 8529a3616..d37bc8ed9 100644 --- a/test_unstructured/partition/test_text.py +++ b/test_unstructured/partition/test_text.py @@ -59,6 +59,10 @@ def test_partition_text_from_text(): assert elements == EXPECTED_OUTPUT +def test_partition_text_from_text_works_with_empty_string(): + assert partition_text(text="") == [] + + def test_partition_text_raises_with_none_specified(): with pytest.raises(ValueError): partition_text() diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 111af2ce6..77a2b5e47 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.8-dev0" # pragma: no cover +__version__ = "0.5.8-dev1" # pragma: no cover diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index eebdcfbf4..86f042d5c 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -168,6 +168,9 @@ def partition_email( f"Valid content sources are: {VALID_CONTENT_SOURCES}", ) + if text is not None and text.strip() == "" and not file and not filename: + return [] + # Verify that only one of the arguments was provided exactly_one(filename=filename, file=file, text=text) diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 7783713ff..cffe28320 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -45,6 +45,9 @@ def partition_html( parser The parser to use for parsing the HTML document. 
If None, default parser will be used. """ + if text is not None and text.strip() == "" and not file and not filename and not url: + return [] + # Verify that only one of the arguments was provided exactly_one(filename=filename, file=file, text=text, url=url) diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index e089de1b6..e0272c6bc 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -6,7 +6,7 @@ from unstructured.documents.elements import Element from unstructured.partition.common import exactly_one from unstructured.staging.base import dict_to_elements -LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{" +LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?" def partition_json( @@ -15,6 +15,9 @@ def partition_json( text: Optional[str] = None, ) -> List[Element]: """Partitions an .json document into its constituent elements.""" + if text is not None and text.strip() == "" and not file and not filename: + return [] + exactly_one(filenmae=filename, file=file, text=text) if filename is not None: diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index be74a4387..7f11c3c74 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -43,6 +43,8 @@ def partition_text( encoding The encoding method used to decode the text input. If None, utf-8 will be used. """ + if text is not None and text.strip() == "" and not file and not filename: + return [] # Verify that only one of the arguments was provided exactly_one(filename=filename, file=file, text=text)