image alt support (#3797)

2025-10-31 01:54:25 +00:00 · 2024-11-26 17:20:23 +01:00 · 2024-11-26 17:20:23 +01:00 · e48d79eca1
commit e48d79eca1
parent 626f73af5b
10 changed files with 192 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,12 @@
+## 0.16.7
+
+### Enhancements
+- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. The parameter can be set to `to_text` to extract alt text as text from `<img>` HTML tags.
+
+### Features
+
+### Fixes
+
 ## 0.16.6

 ### Enhancements
--- a/test_unstructured/documents/html_files/example_with_alternative_text.html
+++ b/test_unstructured/documents/html_files/example_with_alternative_text.html
@ -0,0 +1,8 @@
+<body class="Document" id="897a8a47377c4ad6aab839a929879537">
+ <div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
+   <header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
+    <img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
+    <img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
+   </header>
+ </div>
+</body>
--- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
+++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
        ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
+        (
+            "html_files/example_with_alternative_text.html",
+            "unstructured_json_output/example_with_alternative_text.json",
+        ),
        ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
        (
            "html_files/example_with_inline_fields.html",
@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
 def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path
-
    expected_json_elements = elements_from_json(str(json_file_path))
    html_code = html_file_path.read_text()

    predicted_elements = partition_html(
        text=html_code, html_parser_version="v2", unique_element_ids=True
    )
+
    assert len(expected_json_elements) == len(predicted_elements)

    for i in range(len(expected_json_elements)):
--- a/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json
+++ b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json
@ -0,0 +1,62 @@
+[
+    {
+        "element_id": "3a6b156a81764e17be128264241f8136",
+        "metadata": {
+            "category_depth": 0,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "897a8a47377c4ad6aab839a929879537",
+            "text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
+        },
+        "text": "",
+        "type": "UncategorizedText"
+    },
+    {
+        "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+        "metadata": {
+            "category_depth": 1,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "3a6b156a81764e17be128264241f8136",
+            "text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
+        },
+        "text": "",
+        "type": "UncategorizedText"
+    },
+    {
+        "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
+        "metadata": {
+            "category_depth": 2,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+            "text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
+        },
+        "text": "New York logo",
+        "type": "Image"
+    },
+    {
+        "element_id": "40c32fd8-9a02-42b8-a587-884293881090",
+        "metadata": {
+            "category_depth": 2,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+            "text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
+        },
+        "text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
+        "type": "Image"
+    }
+]
--- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
+++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
    assert len(unstructured_elements) == 2
    assert isinstance(unstructured_elements[0], Text)
    assert isinstance(unstructured_elements[1], NarrativeText)
+
+
+def test_alternate_text_from_image_is_passed():
+    # language=HTML
+    input_html = """
+    <div class="Page">
+    <table>
+        <tr>
+            <td rowspan="2">Example image nested in the table:</td>
+            <td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
+        </tr>
+    </table>
+    </div>
+    """
+    page = parse_html_to_ontology(input_html)  # the fixture is a single Page div
+    unstructured_elements = ontology_to_unstructured_elements(page)  # alt text on by default
+    assert len(unstructured_elements) == 2
+    assert "ALT TEXT Logo" in unstructured_elements[1].text  # <img> alt must surface as text
--- a/test_unstructured/partition/html/test_partition_v2.py
+++ b/test_unstructured/partition/html/test_partition_v2.py
@ -0,0 +1,47 @@
+from unstructured.partition.html import partition_html
+
+
+def test_alternative_image_text_can_be_included():
+    # language=HTML
+    html = """
+    <div class="Page">
+        <img src="my-logo.png" alt="ALT TEXT Logo"/>
+    </div>
+    """
+    _, image_to_text_alt_mode = partition_html(
+        text=html,
+        image_alt_mode="to_text",
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" in image_to_text_alt_mode.text
+
+    _, image_none_alt_mode = partition_html(
+        text=html,
+        image_alt_mode=None,
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" not in image_none_alt_mode.text
+
+
+def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
+    # language=HTML
+    html = """
+    <div class="Page">
+        <p class="Paragraph">
+            <img src="my-logo.png" alt="ALT TEXT Logo"/>
+        </p>
+    </div>
+    """
+    _, paragraph_to_text_alt_mode = partition_html(
+        text=html,
+        image_alt_mode="to_text",
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text
+
+    _, paragraph_none_alt_mode = partition_html(
+        text=html,
+        image_alt_mode=None,
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.6"  # pragma: no cover
+__version__ = "0.16.7"  # pragma: no cover
--- a/unstructured/documents/ontology.py
+++ b/unstructured/documents/ontology.py
@ -89,11 +89,27 @@ class OntologyElement(BaseModel):

        return result_html

-    def to_text(self, add_children=True) -> str:
+    def to_text(self, add_children=True, add_img_alt_text=True) -> str:
+        """
+        Returns the text representation of the element.
+
+        Args:
+            add_children: If True, the text of the children will be included.
+                            Otherwise, element is represented as single self-closing tag.
+            add_img_alt_text: If True, the alt text of the image will be included.
+        """
        if self.children and add_children:
-            children_text = " ".join(child.to_text().strip() for child in self.children)
+            children_text = " ".join(
+                child.to_text(add_children, add_img_alt_text).strip() for child in self.children
+            )
            return children_text
-        return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
+
+        text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
+
+        if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
+            text += f" {self.additional_attributes.get('alt', '')}"
+
+        return text.strip()

    def _construct_attribute_string(self, attributes: dict) -> str:
        return " ".join(
@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
    elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(["input"], frozen=True)

-    def to_text(self, add_children=True) -> str:
-        text = super().to_text()
+    def to_text(self, add_children=True, add_img_alt_text=True) -> str:
+        text = super().to_text(add_children, add_img_alt_text)
        value = self.additional_attributes.get("value", "")
        if not value:
            return text
--- a/unstructured/partition/html/partition.py
+++ b/unstructured/partition/html/partition.py
@ -36,6 +36,7 @@ def partition_html(
    skip_headers_and_footers: bool = False,
    detection_origin: Optional[str] = None,
    html_parser_version: Literal["v1", "v2"] = "v1",
+    image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions an HTML document into its constituent elements.
@ -65,6 +66,9 @@ def partition_html(
    html_parser_version (Literal['v1', 'v2']):
        The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
        use the ontology schema to parse the HTML document.
+
+    image_alt_mode (Optional[Literal['to_text']]):
+        When set to 'to_text' (the default), the v2 parser includes image alt text; None omits it.
    """
    # -- parser rejects an empty str, nip that edge-case in the bud here --
    if text is not None and text.strip() == "" and not file and not filename and not url:
@ -81,6 +85,7 @@ def partition_html(
        skip_headers_and_footers=skip_headers_and_footers,
        detection_origin=detection_origin,
        html_parser_version=html_parser_version,
+        image_alt_mode=image_alt_mode,
    )

    return list(_HtmlPartitioner.iter_elements(opts))
@ -102,6 +107,7 @@ class HtmlPartitionerOptions:
        skip_headers_and_footers: bool,
        detection_origin: str | None,
        html_parser_version: Literal["v1", "v2"] = "v1",
+        image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    ):
        self._file_path = file_path
        self._file = file
@ -113,6 +119,7 @@ class HtmlPartitionerOptions:
        self._skip_headers_and_footers = skip_headers_and_footers
        self._detection_origin = detection_origin
        self._html_parser_version = html_parser_version
+        self._image_alt_mode = image_alt_mode

    @lazyproperty
    def detection_origin(self) -> str | None:
@ -172,6 +179,11 @@ class HtmlPartitionerOptions:
        """When html_parser_version=='v2', HTML elements follow ontology schema."""
        return self._html_parser_version

+    @lazyproperty
+    def add_img_alt_text(self) -> bool:
+        """When True, the alternative text of images is included in the output."""
+        return self._image_alt_mode == "to_text"
+

 class _HtmlPartitioner:
    """Partition HTML document into document-elements."""
@ -239,5 +251,7 @@ class _HtmlPartitioner:
        """Convert an ontology elements represented in HTML to an ontology element."""
        html_text = self._opts.html_text
        ontology = parse_html_to_ontology(html_text)
-        unstructured_elements = ontology_to_unstructured_elements(ontology)
+        unstructured_elements = ontology_to_unstructured_elements(
+            ontology, add_img_alt_text=self._opts.add_img_alt_text
+        )
        return unstructured_elements
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
    page_number: int = None,
    depth: int = 0,
    filename: str | None = None,
+    add_img_alt_text: bool = True,
 ) -> list[elements.Element]:
    """
    Converts an OntologyElement object to a list of unstructured Element objects.
@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
        parent_id (str, optional): The ID of the parent element. Defaults to None.
        page_number (int, optional): The page number of the element. Defaults to None.
        depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
-
+        filename (str, optional): The name of the file the element comes from. Defaults to None.
+        add_img_alt_text (bool): Whether to include the alternative text of images
+                                            in the output. Defaults to True.
    Returns:
        list[Element]: A list of unstructured Element objects.
    """
@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
                page_number=page_number,
                depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
                filename=filename,
+                add_img_alt_text=add_img_alt_text,
            )
            children += child

@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
    else:
        element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
        html_code_of_ontology_element = ontology_element.to_html()
-        element_text = ontology_element.to_text()
+        element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)

        unstructured_element = element_class(
            text=element_text,
@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
    Args:
        html_code (str): The HTML code to be parsed.
            Parsing HTML will start from <div class="Page">.
-
    Returns:
        OntologyElement: The parsed Element object.

@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
    Args:
        soup (Tag): The BeautifulSoup Tag object to be converted.
        recursion_depth (int): Flag to control limit of recursion depth.
-
    Returns:
        OntologyElement: The converted OntologyElement object.
    """