mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Fix: partition on empty or whitespace-only text files (#3675)
This is a fix for this [bug](https://github.com/Unstructured-IO/unstructured/issues/3674): auto partition fails on text files that are empty or contain only whitespace. Inference of the .txt file type fails if the file contains only whitespace.

To reproduce:

```
from tempfile import NamedTemporaryFile
from unstructured.partition.auto import partition

with NamedTemporaryFile(mode="w", suffix=".txt") as f:
    f.write(" \n")
    f.seek(0)
    elements = partition(filename=f.name)
```
This commit is contained in:
parent
50d75c47d3
commit
75c4998bc7
@ -1,4 +1,4 @@
|
||||
## 0.15.14-dev4
|
||||
## 0.15.14-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
* **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK.
|
||||
* **Remove "unused" `date_from_file_object` parameter.** As part of simplifying the partitioning parameter set, remove the `date_from_file_object` parameter. A file object does not have a last-modified date attribute, so it can never give a useful value. When a file object is used as the document source (such as in the Unstructured API), the last-modified date must come from the `metadata_last_modified` argument.
|
||||
* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements passed to `assign_and_map_hash_ids` can contain duplicated element instances, which leads to an error when mapping parent ids.
|
||||
* **Allow empty text files.** Fixes an issue where text files that are empty or contain only whitespace would fail to be partitioned.
|
||||
|
||||
## 0.15.13
|
||||
|
||||
|
3
example-docs/fake-text-all-whitespace.txt
Normal file
3
example-docs/fake-text-all-whitespace.txt
Normal file
@ -0,0 +1,3 @@
|
||||
|
||||
|
||||
|
@ -749,22 +749,30 @@ def test_auto_partition_tsv_from_filename():
|
||||
# ================================================================================================
|
||||
# TXT
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_auto_partition_text_from_filename():
|
||||
file_path = example_doc_path("fake-text.txt")
|
||||
|
||||
elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
|
||||
|
||||
assert elements == [
|
||||
@pytest.mark.parametrize(
|
||||
("filename", "expected_elements"),
|
||||
[
|
||||
(
|
||||
"fake-text.txt",
|
||||
[
|
||||
NarrativeText(text="This is a test document to use for unit tests."),
|
||||
Address(text="Doylestown, PA 18901"),
|
||||
Title(text="Important points:"),
|
||||
ListItem(text="Hamburgers are delicious"),
|
||||
ListItem(text="Dogs are the best"),
|
||||
ListItem(text="I love fuzzy blankets"),
|
||||
]
|
||||
assert all(e.metadata.filename == "fake-text.txt" for e in elements)
|
||||
],
|
||||
),
|
||||
("fake-text-all-whitespace.txt", []),
|
||||
],
|
||||
)
|
||||
def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
|
||||
file_path = example_doc_path(filename)
|
||||
|
||||
elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
|
||||
|
||||
assert elements == expected_elements
|
||||
assert all(e.metadata.filename == filename for e in elements)
|
||||
assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
||||
|
||||
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.14-dev4" # pragma: no cover
|
||||
__version__ = "0.15.14-dev5" # pragma: no cover
|
||||
|
@ -601,7 +601,7 @@ class _TextFileDifferentiator:
|
||||
text_head = self._ctx.text_head
|
||||
|
||||
# -- an empty file is not JSON --
|
||||
if not text_head:
|
||||
if not text_head.lstrip():
|
||||
return False
|
||||
|
||||
# -- has to be a list or object, no string, number, or bool --
|
||||
|
Loading…
x
Reference in New Issue
Block a user