image alt support (#3797)

This commit is contained in:
Pluto 2024-11-26 17:20:23 +01:00 committed by GitHub
parent 626f73af5b
commit e48d79eca1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 192 additions and 12 deletions

View File

@ -1,3 +1,12 @@
## 0.16.7
### Enhancements
- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. Set it to `to_text` (the default) to include the alt text of `<img>` tags in the element text, or to `None` to leave it out. The parameter is only used by the v2 HTML parser (`html_parser_version="v2"`).
### Features
### Fixes
## 0.16.6 ## 0.16.6
### Enhancements ### Enhancements

View File

@ -0,0 +1,8 @@
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
<header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
<img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
<img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
</header>
</div>
</body>

View File

@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
[ [
("html_files/example.html", "unstructured_json_output/example.json"), ("html_files/example.html", "unstructured_json_output/example.json"),
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
(
"html_files/example_with_alternative_text.html",
"unstructured_json_output/example_with_alternative_text.json",
),
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
( (
"html_files/example_with_inline_fields.html", "html_files/example_with_inline_fields.html",
@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
html_file_path = Path(__file__).parent / html_file_path html_file_path = Path(__file__).parent / html_file_path
json_file_path = Path(__file__).parent / json_file_path json_file_path = Path(__file__).parent / json_file_path
expected_json_elements = elements_from_json(str(json_file_path)) expected_json_elements = elements_from_json(str(json_file_path))
html_code = html_file_path.read_text() html_code = html_file_path.read_text()
predicted_elements = partition_html( predicted_elements = partition_html(
text=html_code, html_parser_version="v2", unique_element_ids=True text=html_code, html_parser_version="v2", unique_element_ids=True
) )
assert len(expected_json_elements) == len(predicted_elements) assert len(expected_json_elements) == len(predicted_elements)
for i in range(len(expected_json_elements)): for i in range(len(expected_json_elements)):

View File

@ -0,0 +1,62 @@
[
{
"element_id": "3a6b156a81764e17be128264241f8136",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"metadata": {
"category_depth": 1,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
},
"text": "New York logo",
"type": "Image"
},
{
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
},
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
"type": "Image"
}
]

View File

@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
assert len(unstructured_elements) == 2 assert len(unstructured_elements) == 2
assert isinstance(unstructured_elements[0], Text) assert isinstance(unstructured_elements[0], Text)
assert isinstance(unstructured_elements[1], NarrativeText) assert isinstance(unstructured_elements[1], NarrativeText)
def test_alternate_text_from_image_is_passed():
    """Alt text of an image nested in a table shows up in the second element's text."""
    # NOTE(review): the `add_img_alt_text` token after `</div>` in the fixture below
    # looks like a stray paste; it is kept so the fixture stays unchanged -- confirm
    # and remove.
    # language=HTML
    input_html = """
<div class="Page">
<table>
<tr>
<td rowspan="2">Example image nested in the table:</td>
<td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
</tr>
</table>
</div>add_img_alt_text
"""
    elements = ontology_to_unstructured_elements(parse_html_to_ontology(input_html))

    assert len(elements) == 2
    second_element = elements[1]
    assert "ALT TEXT Logo" in second_element.text

View File

@ -0,0 +1,47 @@
from unstructured.partition.html import partition_html
def test_alternative_image_text_can_be_included():
    """Image alt text is present with image_alt_mode="to_text" and absent with None."""
    # language=HTML
    html = """
<div class="Page">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</div>
"""
    alt_text = "ALT TEXT Logo"
    shared_kwargs = {"text": html, "html_parser_version": "v2"}

    # Unpacking into two names also checks that exactly two elements come back;
    # only the second one is inspected.
    _, element_with_alt = partition_html(image_alt_mode="to_text", **shared_kwargs)
    assert alt_text in element_with_alt.text

    _, element_without_alt = partition_html(image_alt_mode=None, **shared_kwargs)
    assert alt_text not in element_without_alt.text
def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
    """Alt text of an image nested in a paragraph follows image_alt_mode."""
    # language=HTML
    html = """
<div class="Page">
<p class="Paragraph">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</p>
</div>
"""
    # "to_text" must surface the alt text, None must leave it out. Each call has to
    # return exactly two elements, of which the second is checked.
    for mode, alt_expected in (("to_text", True), (None, False)):
        _, second_element = partition_html(
            text=html,
            image_alt_mode=mode,
            html_parser_version="v2",
        )
        assert ("ALT TEXT Logo" in second_element.text) is alt_expected

View File

@ -1 +1 @@
__version__ = "0.16.6" # pragma: no cover __version__ = "0.16.7" # pragma: no cover

View File

@ -89,11 +89,27 @@ class OntologyElement(BaseModel):
return result_html return result_html
def to_text(self, add_children=True) -> str: def to_text(self, add_children=True, add_img_alt_text=True) -> str:
"""
Returns the text representation of the element.
Args:
add_children: If True, the text of the children will be included.
Otherwise, element is represented as single self-closing tag.
add_img_alt_text: If True, the alt text of the image will be included.
"""
if self.children and add_children: if self.children and add_children:
children_text = " ".join(child.to_text().strip() for child in self.children) children_text = " ".join(
child.to_text(add_children, add_img_alt_text).strip() for child in self.children
)
return children_text return children_text
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
text += f" {self.additional_attributes.get('alt', '')}"
return text.strip()
def _construct_attribute_string(self, attributes: dict) -> str: def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join( return " ".join(
@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["input"], frozen=True) allowed_tags: List[str] = Field(["input"], frozen=True)
def to_text(self, add_children=True) -> str: def to_text(self, add_children=True, add_img_alt_text=True) -> str:
text = super().to_text() text = super().to_text(add_children, add_img_alt_text)
value = self.additional_attributes.get("value", "") value = self.additional_attributes.get("value", "")
if not value: if not value:
return text return text

View File

@ -36,6 +36,7 @@ def partition_html(
skip_headers_and_footers: bool = False, skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None, detection_origin: Optional[str] = None,
html_parser_version: Literal["v1", "v2"] = "v1", html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Partitions an HTML document into its constituent elements. """Partitions an HTML document into its constituent elements.
@ -65,6 +66,9 @@ def partition_html(
html_parser_version (Literal['v1', 'v2']): html_parser_version (Literal['v1', 'v2']):
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
use the ontology schema to parse the HTML document. use the ontology schema to parse the HTML document.
image_alt_mode (Optional[Literal['to_text']]):
When set to 'to_text' (the default), the v2 parser will include the alternative text of images in the output. When None, the alternative text is not included.
""" """
# -- parser rejects an empty str, nip that edge-case in the bud here -- # -- parser rejects an empty str, nip that edge-case in the bud here --
if text is not None and text.strip() == "" and not file and not filename and not url: if text is not None and text.strip() == "" and not file and not filename and not url:
@ -81,6 +85,7 @@ def partition_html(
skip_headers_and_footers=skip_headers_and_footers, skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin, detection_origin=detection_origin,
html_parser_version=html_parser_version, html_parser_version=html_parser_version,
image_alt_mode=image_alt_mode,
) )
return list(_HtmlPartitioner.iter_elements(opts)) return list(_HtmlPartitioner.iter_elements(opts))
@ -102,6 +107,7 @@ class HtmlPartitionerOptions:
skip_headers_and_footers: bool, skip_headers_and_footers: bool,
detection_origin: str | None, detection_origin: str | None,
html_parser_version: Literal["v1", "v2"] = "v1", html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
): ):
self._file_path = file_path self._file_path = file_path
self._file = file self._file = file
@ -113,6 +119,7 @@ class HtmlPartitionerOptions:
self._skip_headers_and_footers = skip_headers_and_footers self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin self._detection_origin = detection_origin
self._html_parser_version = html_parser_version self._html_parser_version = html_parser_version
self._image_alt_mode = image_alt_mode
@lazyproperty @lazyproperty
def detection_origin(self) -> str | None: def detection_origin(self) -> str | None:
@ -172,6 +179,11 @@ class HtmlPartitionerOptions:
"""When html_parser_version=='v2', HTML elements follow ontology schema.""" """When html_parser_version=='v2', HTML elements follow ontology schema."""
return self._html_parser_version return self._html_parser_version
@lazyproperty
def add_img_alt_text(self) -> bool:
"""When True, the alternative text of images is included in the output."""
return self._image_alt_mode == "to_text"
class _HtmlPartitioner: class _HtmlPartitioner:
"""Partition HTML document into document-elements.""" """Partition HTML document into document-elements."""
@ -239,5 +251,7 @@ class _HtmlPartitioner:
"""Convert an ontology elements represented in HTML to an ontology element.""" """Convert an ontology elements represented in HTML to an ontology element."""
html_text = self._opts.html_text html_text = self._opts.html_text
ontology = parse_html_to_ontology(html_text) ontology = parse_html_to_ontology(html_text)
unstructured_elements = ontology_to_unstructured_elements(ontology) unstructured_elements = ontology_to_unstructured_elements(
ontology, add_img_alt_text=self._opts.add_img_alt_text
)
return unstructured_elements return unstructured_elements

View File

@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
page_number: int = None, page_number: int = None,
depth: int = 0, depth: int = 0,
filename: str | None = None, filename: str | None = None,
add_img_alt_text: bool = True,
) -> list[elements.Element]: ) -> list[elements.Element]:
""" """
Converts an OntologyElement object to a list of unstructured Element objects. Converts an OntologyElement object to a list of unstructured Element objects.
@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
parent_id (str, optional): The ID of the parent element. Defaults to None. parent_id (str, optional): The ID of the parent element. Defaults to None.
page_number (int, optional): The page number of the element. Defaults to None. page_number (int, optional): The page number of the element. Defaults to None.
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0. depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
filename (str, optional): The name of the file the element comes from. Defaults to None.
add_img_alt_text (bool): Whether to include the alternative text of images
in the output. Defaults to True.
Returns: Returns:
list[Element]: A list of unstructured Element objects. list[Element]: A list of unstructured Element objects.
""" """
@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
page_number=page_number, page_number=page_number,
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1, depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
filename=filename, filename=filename,
add_img_alt_text=add_img_alt_text,
) )
children += child children += child
@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
else: else:
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__] element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
html_code_of_ontology_element = ontology_element.to_html() html_code_of_ontology_element = ontology_element.to_html()
element_text = ontology_element.to_text() element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)
unstructured_element = element_class( unstructured_element = element_class(
text=element_text, text=element_text,
@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
Args: Args:
html_code (str): The HTML code to be parsed. html_code (str): The HTML code to be parsed.
Parsing HTML will start from <div class="Page">. Parsing HTML will start from <div class="Page">.
Returns: Returns:
OntologyElement: The parsed Element object. OntologyElement: The parsed Element object.
@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
Args: Args:
soup (Tag): The BeautifulSoup Tag object to be converted. soup (Tag): The BeautifulSoup Tag object to be converted.
recursion_depth (int): Flag to control limit of recursion depth. recursion_depth (int): Flag to control limit of recursion depth.
Returns: Returns:
OntologyElement: The converted OntologyElement object. OntologyElement: The converted OntologyElement object.
""" """