mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix [NEX-49] : Fix TypeError for empty HTML content (#4032)
### Summary
Addressed a `TypeError` that occurred when partitioning empty or whitespace-only HTML content.
## Test
* The unit test `test_unstructured/partition/html/test_partition.py::test_partition_html_with_empty_content_raises_error` reproduces the `TypeError` before the fix.
* The test now passes with the fix applied.
This commit is contained in:
parent
3f87946f56
commit
c04235c168
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.18.2-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
- **Fixes empty HTML content** Previously, when the HTML content was empty or whitespace-only, the partitioner would raise `TypeError: Invalid input object: NoneType`. It now returns an empty list of elements.
|
||||
|
||||
|
||||
## 0.18.1
|
||||
|
||||
### Enhancements
|
||||
|
@ -5,7 +5,9 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
from typing import Any, Optional
|
||||
|
||||
import pytest
|
||||
@ -1477,3 +1479,40 @@ class Describe_HtmlPartitioner:
|
||||
opts = HtmlPartitionerOptions(**opts_args)
|
||||
|
||||
assert list(_HtmlPartitioner.iter_elements(opts)) == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("test_case", "content"),
|
||||
[
|
||||
("empty_file", ""),
|
||||
("empty_bytes", b""),
|
||||
("whitespace_only", " \n\t \n "),
|
||||
],
|
||||
)
|
||||
def test_partition_html_with_empty_content_raises_error(test_case, content):
|
||||
"""Test that partitioning empty/whitespace-only HTML content won't
|
||||
raise TypeError: Invalid input object: NoneType.
|
||||
|
||||
This reproduces the production error where lxml.etree.strip_elements is called with None
|
||||
when the HTML content is empty, causing etree.fromstring to return None.
|
||||
|
||||
Args:
|
||||
test_case: Description of the test scenario
|
||||
content: The content to test (empty string, empty bytes, or whitespace)
|
||||
"""
|
||||
if test_case == "empty_bytes":
|
||||
# Create a file-like object with empty content
|
||||
empty_file = io.BytesIO(content)
|
||||
partition_html(file=empty_file)
|
||||
else:
|
||||
# Create a temporary file with the given content
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f:
|
||||
f.write(content)
|
||||
f.flush()
|
||||
temp_filename = f.name
|
||||
|
||||
try:
|
||||
elements = partition_html(filename=temp_filename)
|
||||
assert len(elements) == 0
|
||||
finally:
|
||||
os.unlink(temp_filename)
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.18.1" # pragma: no cover
|
||||
__version__ = "0.18.2-dev0" # pragma: no cover
|
||||
|
@ -210,6 +210,11 @@ class _HtmlPartitioner:
|
||||
|
||||
Elements appear in document order.
|
||||
"""
|
||||
# -- handle empty or whitespace-only HTML content --
|
||||
html_text = self._opts.html_text
|
||||
if not html_text or html_text.strip() == "":
|
||||
return
|
||||
|
||||
elements_iter = (
|
||||
self._main.iter_elements()
|
||||
if self._opts.html_parser_version == "v1"
|
||||
@ -264,6 +269,11 @@ class _HtmlPartitioner:
|
||||
def _from_ontology(self) -> List[Element]:
|
||||
"""Convert an ontology elements represented in HTML to an ontology element."""
|
||||
html_text = self._opts.html_text
|
||||
|
||||
# -- handle empty or whitespace-only HTML content --
|
||||
if not html_text or html_text.strip() == "":
|
||||
return []
|
||||
|
||||
ontology = parse_html_to_ontology(html_text)
|
||||
unstructured_elements = ontology_to_unstructured_elements(
|
||||
ontology, add_img_alt_text=self._opts.add_img_alt_text
|
||||
|
Loading…
x
Reference in New Issue
Block a user