fix: text kwargs no longer fail with empty string (#413)

* fix: text kwargs no longer fail with empty string

* linting
This commit is contained in:
Matt Robinson 2023-03-28 17:03:51 -04:00 committed by GitHub
parent 75cf233702
commit 09b52b4fc4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 37 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.5.8-dev0
## 0.5.8-dev1
### Enhancements
@ -11,6 +11,9 @@
### Fixes
* `convert_file_to_text` now passes through the `source_format` and `target_format` kwargs.
* Partitioning functions that accept a `text` kwarg no longer raise an error if an empty
string is passed (an empty list of elements is returned instead).
* `partition_json` no longer fails if the input is an empty list.
## 0.5.7

View File

@ -140,6 +140,10 @@ def test_partition_email_from_text():
assert elements == EXPECTED_OUTPUT
def test_partition_email_from_text_work_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_email(text="")
    assert elements == []
def test_partition_email_from_filename_with_embedded_image():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-image-embedded.eml")
elements = partition_email(filename=filename, content_source="text/plain")

View File

@ -41,6 +41,10 @@ def test_partition_html_from_text():
assert len(elements) > 0
def test_partition_html_from_text_works_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_html(text="")
    assert elements == []
class MockResponse:
def __init__(self, text, status_code, headers={}):
self.text = text

View File

@ -87,6 +87,14 @@ def test_partition_json_raises_with_none_specified():
partition_json()
def test_partition_json_works_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_json(text="")
    assert elements == []
def test_partition_json_works_with_empty_list():
    """A serialized empty JSON list (``"[]"``) yields an empty list of elements."""
    elements = partition_json(text="[]")
    assert elements == []
def test_partition_json_raises_with_too_many_specified():
path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
elements = partition(filename=path)

View File

@ -59,6 +59,10 @@ def test_partition_text_from_text():
assert elements == EXPECTED_OUTPUT
def test_partition_text_from_text_works_with_empty_string():
    """Passing an empty string as ``text`` yields an empty list of elements."""
    elements = partition_text(text="")
    assert elements == []
def test_partition_text_raises_with_none_specified():
with pytest.raises(ValueError):
partition_text()

View File

@ -1 +1 @@
__version__ = "0.5.8-dev0" # pragma: no cover
__version__ = "0.5.8-dev1" # pragma: no cover

View File

@ -168,6 +168,9 @@ def partition_email(
f"Valid content sources are: {VALID_CONTENT_SOURCES}",
)
if text is not None and text.strip() == "" and not file and not filename:
return []
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)

View File

@ -45,6 +45,9 @@ def partition_html(
parser
The parser to use for parsing the HTML document. If None, default parser will be used.
"""
if text is not None and text.strip() == "" and not file and not filename and not url:
return []
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text, url=url)

View File

@ -6,7 +6,7 @@ from unstructured.documents.elements import Element
from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{"
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
def partition_json(
@ -15,6 +15,9 @@ def partition_json(
text: Optional[str] = None,
) -> List[Element]:
"""Partitions an .json document into its constituent elements."""
if text is not None and text.strip() == "" and not file and not filename:
return []
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)
if filename is not None:

View File

@ -43,6 +43,8 @@ def partition_text(
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
"""
if text is not None and text.strip() == "" and not file and not filename:
return []
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)