fix [NEX-49] : Fix TypeError for empty HTML content (#4032)

### Summary

Addressed a TypeError that occurred when partitioning empty or
whitespace-only HTML content.

## Test
* unit test
`test_unstructured/partition/html/test_partition.py::test_partition_html_with_empty_content_raises_error`
can reproduce the TypeError before the fix
* the test now passes with the fix applied
This commit is contained in:
Yuming Long 2025-06-25 11:13:20 -07:00 committed by GitHub
parent 3f87946f56
commit c04235c168
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 60 additions and 1 deletions

View File

@ -1,3 +1,13 @@
## 0.18.2-dev0
### Enhancements
### Features
### Fixes
- **Fixes empty HTML content** Previously, when the HTML content was empty or whitespace-only, the partitioner would raise `TypeError: Invalid input object: NoneType`. Now it returns an empty list of elements.
## 0.18.1
### Enhancements

View File

@ -5,7 +5,9 @@
from __future__ import annotations
import io
import os
import pathlib
import tempfile
from typing import Any, Optional
import pytest
@ -1477,3 +1479,40 @@ class Describe_HtmlPartitioner:
opts = HtmlPartitionerOptions(**opts_args)
assert list(_HtmlPartitioner.iter_elements(opts)) == []
@pytest.mark.parametrize(
    ("test_case", "content"),
    [
        ("empty_file", ""),
        ("empty_bytes", b""),
        ("whitespace_only", " \n\t \n "),
    ],
)
def test_partition_html_with_empty_content_raises_error(test_case, content):
    """Partitioning empty or whitespace-only HTML yields no elements and does not raise.

    Despite its name, this test checks that `TypeError: Invalid input object: NoneType` is
    NOT raised. It reproduces the production error where lxml.etree.strip_elements is called
    with None when the HTML content is empty, causing etree.fromstring to return None.

    Args:
        test_case: Description of the test scenario; also selects how `content` is supplied.
        content: The content to test (empty string, empty bytes, or whitespace).
    """
    if test_case == "empty_bytes":
        # -- bytes content is supplied as an in-memory file-like object --
        elements = partition_html(file=io.BytesIO(content))
    else:
        # -- text content is supplied by path; `delete=False` keeps the file on disk after
        # -- the handle is closed so it can be reopened by name, then removed explicitly.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f:
            f.write(content)
            temp_filename = f.name
        try:
            elements = partition_html(filename=temp_filename)
        finally:
            os.unlink(temp_filename)

    # -- every scenario, including the file-like one, must produce an empty element list --
    assert len(elements) == 0

View File

@ -1 +1 @@
__version__ = "0.18.1" # pragma: no cover
__version__ = "0.18.2-dev0" # pragma: no cover

View File

@ -210,6 +210,11 @@ class _HtmlPartitioner:
Elements appear in document order.
"""
# -- handle empty or whitespace-only HTML content --
html_text = self._opts.html_text
if not html_text or html_text.strip() == "":
return
elements_iter = (
self._main.iter_elements()
if self._opts.html_parser_version == "v1"
@ -264,6 +269,11 @@ class _HtmlPartitioner:
def _from_ontology(self) -> List[Element]:
"""Convert an ontology elements represented in HTML to an ontology element."""
html_text = self._opts.html_text
# -- handle empty or whitespace-only HTML content --
if not html_text or html_text.strip() == "":
return []
ontology = parse_html_to_ontology(html_text)
unstructured_elements = ontology_to_unstructured_elements(
ontology, add_img_alt_text=self._opts.add_img_alt_text