fix: text kwargs no longer fail with empty string (#413)

* fix: text kwargs no longer fail with empty string

* linting
This commit is contained in:
Matt Robinson 2023-03-28 17:03:51 -04:00 committed by GitHub
parent 75cf233702
commit 09b52b4fc4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 37 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.5.8-dev0 ## 0.5.8-dev1
### Enhancements ### Enhancements
@ -11,6 +11,9 @@
### Fixes ### Fixes
* `convert_file_to_text` now passes through the `source_format` and `target_format` kwargs. * `convert_file_to_text` now passes through the `source_format` and `target_format` kwargs.
* Partitioning functions that accept a `text` kwarg no longer raise an error if an empty
string is passed (an empty list of elements is returned instead).
* `partition_json` no longer fails if the input is an empty list.
## 0.5.7 ## 0.5.7

View File

@ -140,6 +140,10 @@ def test_partition_email_from_text():
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_email_from_text_work_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_email(text="")
    assert elements == []
def test_partition_email_from_filename_with_embedded_image(): def test_partition_email_from_filename_with_embedded_image():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-image-embedded.eml") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-image-embedded.eml")
elements = partition_email(filename=filename, content_source="text/plain") elements = partition_email(filename=filename, content_source="text/plain")

View File

@ -41,6 +41,10 @@ def test_partition_html_from_text():
assert len(elements) > 0 assert len(elements) > 0
def test_partition_html_from_text_works_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_html(text="")
    assert elements == []
class MockResponse: class MockResponse:
def __init__(self, text, status_code, headers={}): def __init__(self, text, status_code, headers={}):
self.text = text self.text = text

View File

@ -87,6 +87,14 @@ def test_partition_json_raises_with_none_specified():
partition_json() partition_json()
def test_partition_json_works_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_json(text="")
    assert elements == []
def test_partition_json_works_with_empty_list():
    """A JSON document containing only an empty list yields no elements."""
    elements = partition_json(text="[]")
    assert elements == []
def test_partition_json_raises_with_too_many_specified(): def test_partition_json_raises_with_too_many_specified():
path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
elements = partition(filename=path) elements = partition(filename=path)

View File

@ -59,6 +59,10 @@ def test_partition_text_from_text():
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_text_from_text_works_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_text(text="")
    assert elements == []
def test_partition_text_raises_with_none_specified(): def test_partition_text_raises_with_none_specified():
with pytest.raises(ValueError): with pytest.raises(ValueError):
partition_text() partition_text()

View File

@ -1 +1 @@
__version__ = "0.5.8-dev0" # pragma: no cover __version__ = "0.5.8-dev1" # pragma: no cover

View File

@ -168,6 +168,9 @@ def partition_email(
f"Valid content sources are: {VALID_CONTENT_SOURCES}", f"Valid content sources are: {VALID_CONTENT_SOURCES}",
) )
if text is not None and text.strip() == "" and not file and not filename:
return []
# Verify that only one of the arguments was provided # Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text) exactly_one(filename=filename, file=file, text=text)

View File

@ -45,6 +45,9 @@ def partition_html(
parser parser
The parser to use for parsing the HTML document. If None, default parser will be used. The parser to use for parsing the HTML document. If None, default parser will be used.
""" """
if text is not None and text.strip() == "" and not file and not filename and not url:
return []
# Verify that only one of the arguments was provided # Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text, url=url) exactly_one(filename=filename, file=file, text=text, url=url)

View File

@ -6,7 +6,7 @@ from unstructured.documents.elements import Element
from unstructured.partition.common import exactly_one from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements from unstructured.staging.base import dict_to_elements
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{" LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
def partition_json( def partition_json(
@ -15,6 +15,9 @@ def partition_json(
text: Optional[str] = None, text: Optional[str] = None,
) -> List[Element]: ) -> List[Element]:
"""Partitions an .json document into its constituent elements.""" """Partitions an .json document into its constituent elements."""
if text is not None and text.strip() == "" and not file and not filename:
return []
exactly_one(filenmae=filename, file=file, text=text) exactly_one(filenmae=filename, file=file, text=text)
if filename is not None: if filename is not None:

View File

@ -43,6 +43,8 @@ def partition_text(
encoding encoding
The encoding method used to decode the text input. If None, utf-8 will be used. The encoding method used to decode the text input. If None, utf-8 will be used.
""" """
if text is not None and text.strip() == "" and not file and not filename:
return []
# Verify that only one of the arguments was provided # Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text) exactly_one(filename=filename, file=file, text=text)