mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-01 02:23:36 +00:00
image alt support (#3797)
This commit is contained in:
parent
626f73af5b
commit
e48d79eca1
@ -1,3 +1,12 @@
|
|||||||
|
## 0.16.7
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents when `html_parser_version="v2"`. Set it to `to_text` (the default) to include the alt text of `<img>` tags in the element text, or to `None` to omit it.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.16.6
|
## 0.16.6
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -0,0 +1,8 @@
|
|||||||
|
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
|
||||||
|
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
|
||||||
|
<header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
|
||||||
|
<img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
|
||||||
|
<img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
|
||||||
|
</header>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
|
|||||||
[
|
[
|
||||||
("html_files/example.html", "unstructured_json_output/example.json"),
|
("html_files/example.html", "unstructured_json_output/example.json"),
|
||||||
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
|
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
|
||||||
|
(
|
||||||
|
"html_files/example_with_alternative_text.html",
|
||||||
|
"unstructured_json_output/example_with_alternative_text.json",
|
||||||
|
),
|
||||||
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
|
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
|
||||||
(
|
(
|
||||||
"html_files/example_with_inline_fields.html",
|
"html_files/example_with_inline_fields.html",
|
||||||
@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
|
|||||||
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
|
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
|
||||||
html_file_path = Path(__file__).parent / html_file_path
|
html_file_path = Path(__file__).parent / html_file_path
|
||||||
json_file_path = Path(__file__).parent / json_file_path
|
json_file_path = Path(__file__).parent / json_file_path
|
||||||
|
|
||||||
expected_json_elements = elements_from_json(str(json_file_path))
|
expected_json_elements = elements_from_json(str(json_file_path))
|
||||||
html_code = html_file_path.read_text()
|
html_code = html_file_path.read_text()
|
||||||
|
|
||||||
predicted_elements = partition_html(
|
predicted_elements = partition_html(
|
||||||
text=html_code, html_parser_version="v2", unique_element_ids=True
|
text=html_code, html_parser_version="v2", unique_element_ids=True
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(expected_json_elements) == len(predicted_elements)
|
assert len(expected_json_elements) == len(predicted_elements)
|
||||||
|
|
||||||
for i in range(len(expected_json_elements)):
|
for i in range(len(expected_json_elements)):
|
||||||
|
|||||||
@ -0,0 +1,62 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"element_id": "3a6b156a81764e17be128264241f8136",
|
||||||
|
"metadata": {
|
||||||
|
"category_depth": 0,
|
||||||
|
"filetype": "text/html",
|
||||||
|
"languages": [
|
||||||
|
"eng"
|
||||||
|
],
|
||||||
|
"page_number": 1,
|
||||||
|
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||||
|
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
|
||||||
|
},
|
||||||
|
"text": "",
|
||||||
|
"type": "UncategorizedText"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||||
|
"metadata": {
|
||||||
|
"category_depth": 1,
|
||||||
|
"filetype": "text/html",
|
||||||
|
"languages": [
|
||||||
|
"eng"
|
||||||
|
],
|
||||||
|
"page_number": 1,
|
||||||
|
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||||
|
"text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
|
||||||
|
},
|
||||||
|
"text": "",
|
||||||
|
"type": "UncategorizedText"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
|
||||||
|
"metadata": {
|
||||||
|
"category_depth": 2,
|
||||||
|
"filetype": "text/html",
|
||||||
|
"languages": [
|
||||||
|
"eng"
|
||||||
|
],
|
||||||
|
"page_number": 1,
|
||||||
|
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||||
|
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
|
||||||
|
},
|
||||||
|
"text": "New York logo",
|
||||||
|
"type": "Image"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
|
||||||
|
"metadata": {
|
||||||
|
"category_depth": 2,
|
||||||
|
"filetype": "text/html",
|
||||||
|
"languages": [
|
||||||
|
"eng"
|
||||||
|
],
|
||||||
|
"page_number": 1,
|
||||||
|
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||||
|
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
|
||||||
|
},
|
||||||
|
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
|
||||||
|
"type": "Image"
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
|
|||||||
assert len(unstructured_elements) == 2
|
assert len(unstructured_elements) == 2
|
||||||
assert isinstance(unstructured_elements[0], Text)
|
assert isinstance(unstructured_elements[0], Text)
|
||||||
assert isinstance(unstructured_elements[1], NarrativeText)
|
assert isinstance(unstructured_elements[1], NarrativeText)
|
||||||
|
|
||||||
|
|
||||||
|
def test_alternate_text_from_image_is_passed():
|
||||||
|
# language=HTML
|
||||||
|
input_html = """
|
||||||
|
<div class="Page">
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td rowspan="2">Example image nested in the table:</td>
|
||||||
|
<td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</div>add_img_alt_text
|
||||||
|
"""
|
||||||
|
page = parse_html_to_ontology(input_html)
|
||||||
|
unstructured_elements = ontology_to_unstructured_elements(page)
|
||||||
|
assert len(unstructured_elements) == 2
|
||||||
|
assert "ALT TEXT Logo" in unstructured_elements[1].text
|
||||||
|
|||||||
47
test_unstructured/partition/html/test_partition_v2.py
Normal file
47
test_unstructured/partition/html/test_partition_v2.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_alternative_image_text_can_be_included():
|
||||||
|
# language=HTML
|
||||||
|
html = """
|
||||||
|
<div class="Page">
|
||||||
|
<img src="my-logo.png" alt="ALT TEXT Logo"/>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
_, image_to_text_alt_mode = partition_html(
|
||||||
|
text=html,
|
||||||
|
image_alt_mode="to_text",
|
||||||
|
html_parser_version="v2",
|
||||||
|
)
|
||||||
|
assert "ALT TEXT Logo" in image_to_text_alt_mode.text
|
||||||
|
|
||||||
|
_, image_none_alt_mode = partition_html(
|
||||||
|
text=html,
|
||||||
|
image_alt_mode=None,
|
||||||
|
html_parser_version="v2",
|
||||||
|
)
|
||||||
|
assert "ALT TEXT Logo" not in image_none_alt_mode.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
|
||||||
|
# language=HTML
|
||||||
|
html = """
|
||||||
|
<div class="Page">
|
||||||
|
<p class="Paragraph">
|
||||||
|
<img src="my-logo.png" alt="ALT TEXT Logo"/>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
_, paragraph_to_text_alt_mode = partition_html(
|
||||||
|
text=html,
|
||||||
|
image_alt_mode="to_text",
|
||||||
|
html_parser_version="v2",
|
||||||
|
)
|
||||||
|
assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text
|
||||||
|
|
||||||
|
_, paragraph_none_alt_mode = partition_html(
|
||||||
|
text=html,
|
||||||
|
image_alt_mode=None,
|
||||||
|
html_parser_version="v2",
|
||||||
|
)
|
||||||
|
assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
|
||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.16.6" # pragma: no cover
|
__version__ = "0.16.7" # pragma: no cover
|
||||||
|
|||||||
@ -89,11 +89,27 @@ class OntologyElement(BaseModel):
|
|||||||
|
|
||||||
return result_html
|
return result_html
|
||||||
|
|
||||||
def to_text(self, add_children=True) -> str:
|
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
|
||||||
|
"""
|
||||||
|
Returns the text representation of the element.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
add_children: If True, the text of the children will be included.
|
||||||
|
Otherwise, element is represented as single self-closing tag.
|
||||||
|
add_img_alt_text: If True, the alt text of the image will be included.
|
||||||
|
"""
|
||||||
if self.children and add_children:
|
if self.children and add_children:
|
||||||
children_text = " ".join(child.to_text().strip() for child in self.children)
|
children_text = " ".join(
|
||||||
|
child.to_text(add_children, add_img_alt_text).strip() for child in self.children
|
||||||
|
)
|
||||||
return children_text
|
return children_text
|
||||||
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
|
|
||||||
|
text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
|
||||||
|
|
||||||
|
if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
|
||||||
|
text += f" {self.additional_attributes.get('alt', '')}"
|
||||||
|
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
def _construct_attribute_string(self, attributes: dict) -> str:
|
def _construct_attribute_string(self, attributes: dict) -> str:
|
||||||
return " ".join(
|
return " ".join(
|
||||||
@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
|
|||||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
||||||
allowed_tags: List[str] = Field(["input"], frozen=True)
|
allowed_tags: List[str] = Field(["input"], frozen=True)
|
||||||
|
|
||||||
def to_text(self, add_children=True) -> str:
|
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
|
||||||
text = super().to_text()
|
text = super().to_text(add_children, add_img_alt_text)
|
||||||
value = self.additional_attributes.get("value", "")
|
value = self.additional_attributes.get("value", "")
|
||||||
if not value:
|
if not value:
|
||||||
return text
|
return text
|
||||||
|
|||||||
@ -36,6 +36,7 @@ def partition_html(
|
|||||||
skip_headers_and_footers: bool = False,
|
skip_headers_and_footers: bool = False,
|
||||||
detection_origin: Optional[str] = None,
|
detection_origin: Optional[str] = None,
|
||||||
html_parser_version: Literal["v1", "v2"] = "v1",
|
html_parser_version: Literal["v1", "v2"] = "v1",
|
||||||
|
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Partitions an HTML document into its constituent elements.
|
"""Partitions an HTML document into its constituent elements.
|
||||||
@ -65,6 +66,9 @@ def partition_html(
|
|||||||
html_parser_version (Literal['v1', 'v2']):
|
html_parser_version (Literal['v1', 'v2']):
|
||||||
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
|
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
|
||||||
use the ontology schema to parse the HTML document.
|
use the ontology schema to parse the HTML document.
|
||||||
|
|
||||||
|
image_alt_mode (Optional[Literal['to_text']]):
|
||||||
|
When set to 'to_text' (the default), the v2 parser includes the alternative text of images in the output. When None, the alternative text is omitted.
|
||||||
"""
|
"""
|
||||||
# -- parser rejects an empty str, nip that edge-case in the bud here --
|
# -- parser rejects an empty str, nip that edge-case in the bud here --
|
||||||
if text is not None and text.strip() == "" and not file and not filename and not url:
|
if text is not None and text.strip() == "" and not file and not filename and not url:
|
||||||
@ -81,6 +85,7 @@ def partition_html(
|
|||||||
skip_headers_and_footers=skip_headers_and_footers,
|
skip_headers_and_footers=skip_headers_and_footers,
|
||||||
detection_origin=detection_origin,
|
detection_origin=detection_origin,
|
||||||
html_parser_version=html_parser_version,
|
html_parser_version=html_parser_version,
|
||||||
|
image_alt_mode=image_alt_mode,
|
||||||
)
|
)
|
||||||
|
|
||||||
return list(_HtmlPartitioner.iter_elements(opts))
|
return list(_HtmlPartitioner.iter_elements(opts))
|
||||||
@ -102,6 +107,7 @@ class HtmlPartitionerOptions:
|
|||||||
skip_headers_and_footers: bool,
|
skip_headers_and_footers: bool,
|
||||||
detection_origin: str | None,
|
detection_origin: str | None,
|
||||||
html_parser_version: Literal["v1", "v2"] = "v1",
|
html_parser_version: Literal["v1", "v2"] = "v1",
|
||||||
|
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
|
||||||
):
|
):
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._file = file
|
self._file = file
|
||||||
@ -113,6 +119,7 @@ class HtmlPartitionerOptions:
|
|||||||
self._skip_headers_and_footers = skip_headers_and_footers
|
self._skip_headers_and_footers = skip_headers_and_footers
|
||||||
self._detection_origin = detection_origin
|
self._detection_origin = detection_origin
|
||||||
self._html_parser_version = html_parser_version
|
self._html_parser_version = html_parser_version
|
||||||
|
self._image_alt_mode = image_alt_mode
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def detection_origin(self) -> str | None:
|
def detection_origin(self) -> str | None:
|
||||||
@ -172,6 +179,11 @@ class HtmlPartitionerOptions:
|
|||||||
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
|
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
|
||||||
return self._html_parser_version
|
return self._html_parser_version
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def add_img_alt_text(self) -> bool:
|
||||||
|
"""When True, the alternative text of images is included in the output."""
|
||||||
|
return self._image_alt_mode == "to_text"
|
||||||
|
|
||||||
|
|
||||||
class _HtmlPartitioner:
|
class _HtmlPartitioner:
|
||||||
"""Partition HTML document into document-elements."""
|
"""Partition HTML document into document-elements."""
|
||||||
@ -239,5 +251,7 @@ class _HtmlPartitioner:
|
|||||||
"""Convert HTML that follows the ontology schema to unstructured elements."""
|
"""Convert HTML that follows the ontology schema to unstructured elements."""
|
||||||
html_text = self._opts.html_text
|
html_text = self._opts.html_text
|
||||||
ontology = parse_html_to_ontology(html_text)
|
ontology = parse_html_to_ontology(html_text)
|
||||||
unstructured_elements = ontology_to_unstructured_elements(ontology)
|
unstructured_elements = ontology_to_unstructured_elements(
|
||||||
|
ontology, add_img_alt_text=self._opts.add_img_alt_text
|
||||||
|
)
|
||||||
return unstructured_elements
|
return unstructured_elements
|
||||||
|
|||||||
@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
|
|||||||
page_number: int = None,
|
page_number: int = None,
|
||||||
depth: int = 0,
|
depth: int = 0,
|
||||||
filename: str | None = None,
|
filename: str | None = None,
|
||||||
|
add_img_alt_text: bool = True,
|
||||||
) -> list[elements.Element]:
|
) -> list[elements.Element]:
|
||||||
"""
|
"""
|
||||||
Converts an OntologyElement object to a list of unstructured Element objects.
|
Converts an OntologyElement object to a list of unstructured Element objects.
|
||||||
@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
|
|||||||
parent_id (str, optional): The ID of the parent element. Defaults to None.
|
parent_id (str, optional): The ID of the parent element. Defaults to None.
|
||||||
page_number (int, optional): The page number of the element. Defaults to None.
|
page_number (int, optional): The page number of the element. Defaults to None.
|
||||||
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
|
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
|
||||||
|
filename (str, optional): The name of the file the element comes from. Defaults to None.
|
||||||
|
add_img_alt_text (bool): Whether to include the alternative text of images
|
||||||
|
in the output. Defaults to True.
|
||||||
Returns:
|
Returns:
|
||||||
list[Element]: A list of unstructured Element objects.
|
list[Element]: A list of unstructured Element objects.
|
||||||
"""
|
"""
|
||||||
@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
|
|||||||
page_number=page_number,
|
page_number=page_number,
|
||||||
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
|
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
|
add_img_alt_text=add_img_alt_text,
|
||||||
)
|
)
|
||||||
children += child
|
children += child
|
||||||
|
|
||||||
@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
|
|||||||
else:
|
else:
|
||||||
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
|
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
|
||||||
html_code_of_ontology_element = ontology_element.to_html()
|
html_code_of_ontology_element = ontology_element.to_html()
|
||||||
element_text = ontology_element.to_text()
|
element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)
|
||||||
|
|
||||||
unstructured_element = element_class(
|
unstructured_element = element_class(
|
||||||
text=element_text,
|
text=element_text,
|
||||||
@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
|
|||||||
Args:
|
Args:
|
||||||
html_code (str): The HTML code to be parsed.
|
html_code (str): The HTML code to be parsed.
|
||||||
Parsing HTML will start from <div class="Page">.
|
Parsing HTML will start from <div class="Page">.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
OntologyElement: The parsed Element object.
|
OntologyElement: The parsed Element object.
|
||||||
|
|
||||||
@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
|
|||||||
Args:
|
Args:
|
||||||
soup (Tag): The BeautifulSoup Tag object to be converted.
|
soup (Tag): The BeautifulSoup Tag object to be converted.
|
||||||
recursion_depth (int): Flag to control limit of recursion depth.
|
recursion_depth (int): Flag to control limit of recursion depth.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
OntologyElement: The converted OntologyElement object.
|
OntologyElement: The converted OntologyElement object.
|
||||||
"""
|
"""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user