mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 15:11:30 +00:00
fix [NEX-49] : Fix TypeError for empty HTML content (#4032)
### Summary Addressed a TypeError that occurred when partitioning empty or whitespace-only HTML content. ## Test * The unit test `test_unstructured/partition/html/test_partition.py::test_partition_html_with_empty_content_raises_error` reproduces the TypeError before the fix. * The test passes with the fix applied.
This commit is contained in:
parent
3f87946f56
commit
c04235c168
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
|||||||
|
## 0.18.2-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
- **Fixes empty HTML content** Previously, when the HTML content was empty or whitespace-only, the partitioner would raise `TypeError: Invalid input object: NoneType`. Now it returns an empty list of elements.
|
||||||
|
|
||||||
|
|
||||||
## 0.18.1
|
## 0.18.1
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -5,7 +5,9 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import io
|
import io
|
||||||
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import tempfile
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -1477,3 +1479,40 @@ class Describe_HtmlPartitioner:
|
|||||||
opts = HtmlPartitionerOptions(**opts_args)
|
opts = HtmlPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
assert list(_HtmlPartitioner.iter_elements(opts)) == []
|
assert list(_HtmlPartitioner.iter_elements(opts)) == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("test_case", "content"),
|
||||||
|
[
|
||||||
|
("empty_file", ""),
|
||||||
|
("empty_bytes", b""),
|
||||||
|
("whitespace_only", " \n\t \n "),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_html_with_empty_content_raises_error(test_case, content):
|
||||||
|
"""Test that partitioning empty/whitespace-only HTML content won't
|
||||||
|
raise TypeError: Invalid input object: NoneType.
|
||||||
|
|
||||||
|
This reproduces the production error where lxml.etree.strip_elements is called with None
|
||||||
|
when the HTML content is empty, causing etree.fromstring to return None.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
test_case: Description of the test scenario
|
||||||
|
content: The content to test (empty string, empty bytes, or whitespace)
|
||||||
|
"""
|
||||||
|
if test_case == "empty_bytes":
|
||||||
|
# Create a file-like object with empty content
|
||||||
|
empty_file = io.BytesIO(content)
|
||||||
|
partition_html(file=empty_file)
|
||||||
|
else:
|
||||||
|
# Create a temporary file with the given content
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f:
|
||||||
|
f.write(content)
|
||||||
|
f.flush()
|
||||||
|
temp_filename = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
elements = partition_html(filename=temp_filename)
|
||||||
|
assert len(elements) == 0
|
||||||
|
finally:
|
||||||
|
os.unlink(temp_filename)
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.18.1" # pragma: no cover
|
__version__ = "0.18.2-dev0" # pragma: no cover
|
||||||
|
@ -210,6 +210,11 @@ class _HtmlPartitioner:
|
|||||||
|
|
||||||
Elements appear in document order.
|
Elements appear in document order.
|
||||||
"""
|
"""
|
||||||
|
# -- handle empty or whitespace-only HTML content --
|
||||||
|
html_text = self._opts.html_text
|
||||||
|
if not html_text or html_text.strip() == "":
|
||||||
|
return
|
||||||
|
|
||||||
elements_iter = (
|
elements_iter = (
|
||||||
self._main.iter_elements()
|
self._main.iter_elements()
|
||||||
if self._opts.html_parser_version == "v1"
|
if self._opts.html_parser_version == "v1"
|
||||||
@ -264,6 +269,11 @@ class _HtmlPartitioner:
|
|||||||
def _from_ontology(self) -> List[Element]:
|
def _from_ontology(self) -> List[Element]:
|
||||||
"""Convert an ontology elements represented in HTML to an ontology element."""
|
"""Convert an ontology elements represented in HTML to an ontology element."""
|
||||||
html_text = self._opts.html_text
|
html_text = self._opts.html_text
|
||||||
|
|
||||||
|
# -- handle empty or whitespace-only HTML content --
|
||||||
|
if not html_text or html_text.strip() == "":
|
||||||
|
return []
|
||||||
|
|
||||||
ontology = parse_html_to_ontology(html_text)
|
ontology = parse_html_to_ontology(html_text)
|
||||||
unstructured_elements = ontology_to_unstructured_elements(
|
unstructured_elements = ontology_to_unstructured_elements(
|
||||||
ontology, add_img_alt_text=self._opts.add_img_alt_text
|
ontology, add_img_alt_text=self._opts.add_img_alt_text
|
||||||
|
Loading…
x
Reference in New Issue
Block a user