mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-30 01:17:43 +00:00
image alt support (#3797)
This commit is contained in:
parent
626f73af5b
commit
e48d79eca1
@ -1,3 +1,12 @@
|
||||
## 0.16.7
|
||||
|
||||
### Enhancements
|
||||
- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` that controls whether the alt text of images is extracted when `html_parser_version="v2"` is used. When set to `to_text` (the default), the alt text of `<img>` HTML tags is included in the element text; when set to `None`, the alt text is omitted.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.16.6
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
|
||||
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
|
||||
<header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
|
||||
<img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
|
||||
<img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
|
||||
</header>
|
||||
</div>
|
||||
</body>
|
||||
@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
|
||||
[
|
||||
("html_files/example.html", "unstructured_json_output/example.json"),
|
||||
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
|
||||
(
|
||||
"html_files/example_with_alternative_text.html",
|
||||
"unstructured_json_output/example_with_alternative_text.json",
|
||||
),
|
||||
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
|
||||
(
|
||||
"html_files/example_with_inline_fields.html",
|
||||
@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
|
||||
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
|
||||
html_file_path = Path(__file__).parent / html_file_path
|
||||
json_file_path = Path(__file__).parent / json_file_path
|
||||
|
||||
expected_json_elements = elements_from_json(str(json_file_path))
|
||||
html_code = html_file_path.read_text()
|
||||
|
||||
predicted_elements = partition_html(
|
||||
text=html_code, html_parser_version="v2", unique_element_ids=True
|
||||
)
|
||||
|
||||
assert len(expected_json_elements) == len(predicted_elements)
|
||||
|
||||
for i in range(len(expected_json_elements)):
|
||||
|
||||
@ -0,0 +1,62 @@
|
||||
[
|
||||
{
|
||||
"element_id": "3a6b156a81764e17be128264241f8136",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
|
||||
},
|
||||
"text": "New York logo",
|
||||
"type": "Image"
|
||||
},
|
||||
{
|
||||
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
|
||||
},
|
||||
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
|
||||
"type": "Image"
|
||||
}
|
||||
]
|
||||
@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
|
||||
assert len(unstructured_elements) == 2
|
||||
assert isinstance(unstructured_elements[0], Text)
|
||||
assert isinstance(unstructured_elements[1], NarrativeText)
|
||||
|
||||
|
||||
def test_alternate_text_from_image_is_passed():
|
||||
# language=HTML
|
||||
input_html = """
|
||||
<div class="Page">
|
||||
<table>
|
||||
<tr>
|
||||
<td rowspan="2">Example image nested in the table:</td>
|
||||
<td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
"""
|
||||
page = parse_html_to_ontology(input_html)
|
||||
unstructured_elements = ontology_to_unstructured_elements(page)
|
||||
assert len(unstructured_elements) == 2
|
||||
assert "ALT TEXT Logo" in unstructured_elements[1].text
|
||||
|
||||
47
test_unstructured/partition/html/test_partition_v2.py
Normal file
47
test_unstructured/partition/html/test_partition_v2.py
Normal file
@ -0,0 +1,47 @@
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
|
||||
def test_alternative_image_text_can_be_included():
|
||||
# language=HTML
|
||||
html = """
|
||||
<div class="Page">
|
||||
<img src="my-logo.png" alt="ALT TEXT Logo"/>
|
||||
</div>
|
||||
"""
|
||||
_, image_to_text_alt_mode = partition_html(
|
||||
text=html,
|
||||
image_alt_mode="to_text",
|
||||
html_parser_version="v2",
|
||||
)
|
||||
assert "ALT TEXT Logo" in image_to_text_alt_mode.text
|
||||
|
||||
_, image_none_alt_mode = partition_html(
|
||||
text=html,
|
||||
image_alt_mode=None,
|
||||
html_parser_version="v2",
|
||||
)
|
||||
assert "ALT TEXT Logo" not in image_none_alt_mode.text
|
||||
|
||||
|
||||
def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
|
||||
# language=HTML
|
||||
html = """
|
||||
<div class="Page">
|
||||
<p class="Paragraph">
|
||||
<img src="my-logo.png" alt="ALT TEXT Logo"/>
|
||||
</p>
|
||||
</div>
|
||||
"""
|
||||
_, paragraph_to_text_alt_mode = partition_html(
|
||||
text=html,
|
||||
image_alt_mode="to_text",
|
||||
html_parser_version="v2",
|
||||
)
|
||||
assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text
|
||||
|
||||
_, paragraph_none_alt_mode = partition_html(
|
||||
text=html,
|
||||
image_alt_mode=None,
|
||||
html_parser_version="v2",
|
||||
)
|
||||
assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.6" # pragma: no cover
|
||||
__version__ = "0.16.7" # pragma: no cover
|
||||
|
||||
@ -89,11 +89,27 @@ class OntologyElement(BaseModel):
|
||||
|
||||
return result_html
|
||||
|
||||
def to_text(self, add_children=True) -> str:
|
||||
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
|
||||
"""
|
||||
Returns the text representation of the element.
|
||||
|
||||
Args:
|
||||
add_children: If True, the text of the children will be included.
|
||||
Otherwise, element is represented as single self-closing tag.
|
||||
add_img_alt_text: If True, the alt text of the image will be included.
|
||||
"""
|
||||
if self.children and add_children:
|
||||
children_text = " ".join(child.to_text().strip() for child in self.children)
|
||||
children_text = " ".join(
|
||||
child.to_text(add_children, add_img_alt_text).strip() for child in self.children
|
||||
)
|
||||
return children_text
|
||||
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
|
||||
|
||||
text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
|
||||
|
||||
if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
|
||||
text += f" {self.additional_attributes.get('alt', '')}"
|
||||
|
||||
return text.strip()
|
||||
|
||||
def _construct_attribute_string(self, attributes: dict) -> str:
|
||||
return " ".join(
|
||||
@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
||||
allowed_tags: List[str] = Field(["input"], frozen=True)
|
||||
|
||||
def to_text(self, add_children=True) -> str:
|
||||
text = super().to_text()
|
||||
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
|
||||
text = super().to_text(add_children, add_img_alt_text)
|
||||
value = self.additional_attributes.get("value", "")
|
||||
if not value:
|
||||
return text
|
||||
|
||||
@ -36,6 +36,7 @@ def partition_html(
|
||||
skip_headers_and_footers: bool = False,
|
||||
detection_origin: Optional[str] = None,
|
||||
html_parser_version: Literal["v1", "v2"] = "v1",
|
||||
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions an HTML document into its constituent elements.
|
||||
@ -65,6 +66,9 @@ def partition_html(
|
||||
html_parser_version (Literal['v1', 'v2']):
|
||||
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
|
||||
use the ontology schema to parse the HTML document.
|
||||
|
||||
image_alt_mode (Optional[Literal['to_text']]):
|
||||
When set to 'to_text' (the default), the v2 parser includes the alternative text of images in the output. When set to None, the alternative text is omitted.
|
||||
"""
|
||||
# -- parser rejects an empty str, nip that edge-case in the bud here --
|
||||
if text is not None and text.strip() == "" and not file and not filename and not url:
|
||||
@ -81,6 +85,7 @@ def partition_html(
|
||||
skip_headers_and_footers=skip_headers_and_footers,
|
||||
detection_origin=detection_origin,
|
||||
html_parser_version=html_parser_version,
|
||||
image_alt_mode=image_alt_mode,
|
||||
)
|
||||
|
||||
return list(_HtmlPartitioner.iter_elements(opts))
|
||||
@ -102,6 +107,7 @@ class HtmlPartitionerOptions:
|
||||
skip_headers_and_footers: bool,
|
||||
detection_origin: str | None,
|
||||
html_parser_version: Literal["v1", "v2"] = "v1",
|
||||
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
|
||||
):
|
||||
self._file_path = file_path
|
||||
self._file = file
|
||||
@ -113,6 +119,7 @@ class HtmlPartitionerOptions:
|
||||
self._skip_headers_and_footers = skip_headers_and_footers
|
||||
self._detection_origin = detection_origin
|
||||
self._html_parser_version = html_parser_version
|
||||
self._image_alt_mode = image_alt_mode
|
||||
|
||||
@lazyproperty
|
||||
def detection_origin(self) -> str | None:
|
||||
@ -172,6 +179,11 @@ class HtmlPartitionerOptions:
|
||||
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
|
||||
return self._html_parser_version
|
||||
|
||||
@lazyproperty
|
||||
def add_img_alt_text(self) -> bool:
|
||||
"""When True, the alternative text of images is included in the output."""
|
||||
return self._image_alt_mode == "to_text"
|
||||
|
||||
|
||||
class _HtmlPartitioner:
|
||||
"""Partition HTML document into document-elements."""
|
||||
@ -239,5 +251,7 @@ class _HtmlPartitioner:
|
||||
"""Convert ontology elements represented in HTML to unstructured elements."""
|
||||
html_text = self._opts.html_text
|
||||
ontology = parse_html_to_ontology(html_text)
|
||||
unstructured_elements = ontology_to_unstructured_elements(ontology)
|
||||
unstructured_elements = ontology_to_unstructured_elements(
|
||||
ontology, add_img_alt_text=self._opts.add_img_alt_text
|
||||
)
|
||||
return unstructured_elements
|
||||
|
||||
@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
|
||||
page_number: int = None,
|
||||
depth: int = 0,
|
||||
filename: str | None = None,
|
||||
add_img_alt_text: bool = True,
|
||||
) -> list[elements.Element]:
|
||||
"""
|
||||
Converts an OntologyElement object to a list of unstructured Element objects.
|
||||
@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
|
||||
parent_id (str, optional): The ID of the parent element. Defaults to None.
|
||||
page_number (int, optional): The page number of the element. Defaults to None.
|
||||
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
|
||||
|
||||
filename (str, optional): The name of the file the element comes from. Defaults to None.
|
||||
add_img_alt_text (bool): Whether to include the alternative text of images
|
||||
in the output. Defaults to True.
|
||||
Returns:
|
||||
list[Element]: A list of unstructured Element objects.
|
||||
"""
|
||||
@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
|
||||
page_number=page_number,
|
||||
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
|
||||
filename=filename,
|
||||
add_img_alt_text=add_img_alt_text,
|
||||
)
|
||||
children += child
|
||||
|
||||
@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
|
||||
else:
|
||||
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
|
||||
html_code_of_ontology_element = ontology_element.to_html()
|
||||
element_text = ontology_element.to_text()
|
||||
element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)
|
||||
|
||||
unstructured_element = element_class(
|
||||
text=element_text,
|
||||
@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
|
||||
Args:
|
||||
html_code (str): The HTML code to be parsed.
|
||||
Parsing HTML will start from <div class="Page">.
|
||||
|
||||
Returns:
|
||||
OntologyElement: The parsed Element object.
|
||||
|
||||
@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
|
||||
Args:
|
||||
soup (Tag): The BeautifulSoup Tag object to be converted.
|
||||
recursion_depth (int): Flag to control limit of recursion depth.
|
||||
|
||||
Returns:
|
||||
OntologyElement: The converted OntologyElement object.
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user