image alt support (#3797)

2025-10-31 10:03:07 +00:00 · 2024-11-26 17:20:23 +01:00 · 2024-11-26 17:20:23 +01:00 · e48d79eca1
commit e48d79eca1
parent 626f73af5b
10 changed files with 192 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,12 @@
 ## 0.16.7
 ### Enhancements
 - **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. Set it to `to_text` (the default) to include the alt text of `<img>` HTML tags in the element text, or to `None` to leave it out. The parameter is used by the `v2` HTML parser.
 ### Features
 ### Fixes
 ## 0.16.6
 ### Enhancements
--- a/test_unstructured/documents/html_files/example_with_alternative_text.html
+++ b/test_unstructured/documents/html_files/example_with_alternative_text.html
@ -0,0 +1,8 @@
 <body class="Document" id="897a8a47377c4ad6aab839a929879537">
 <div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
   <header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
    <img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
    <img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
   </header>
 </div>
 </body>
--- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
+++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
        ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
        (
            "html_files/example_with_alternative_text.html",
            "unstructured_json_output/example_with_alternative_text.json",
        ),
        ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
        (
            "html_files/example_with_inline_fields.html",
@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
 def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path
    expected_json_elements = elements_from_json(str(json_file_path))
    html_code = html_file_path.read_text()
    predicted_elements = partition_html(
        text=html_code, html_parser_version="v2", unique_element_ids=True
    )
    assert len(expected_json_elements) == len(predicted_elements)
    for i in range(len(expected_json_elements)):
--- a/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json
+++ b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json
@ -0,0 +1,62 @@
 [
    {
        "element_id": "3a6b156a81764e17be128264241f8136",
        "metadata": {
            "category_depth": 0,
            "filetype": "text/html",
            "languages": [
                "eng"
            ],
            "page_number": 1,
            "parent_id": "897a8a47377c4ad6aab839a929879537",
            "text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
        },
        "text": "",
        "type": "UncategorizedText"
    },
    {
        "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
        "metadata": {
            "category_depth": 1,
            "filetype": "text/html",
            "languages": [
                "eng"
            ],
            "page_number": 1,
            "parent_id": "3a6b156a81764e17be128264241f8136",
            "text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
        },
        "text": "",
        "type": "UncategorizedText"
    },
    {
        "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
        "metadata": {
            "category_depth": 2,
            "filetype": "text/html",
            "languages": [
                "eng"
            ],
            "page_number": 1,
            "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
            "text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
        },
        "text": "New York logo",
        "type": "Image"
    },
    {
        "element_id": "40c32fd8-9a02-42b8-a587-884293881090",
        "metadata": {
            "category_depth": 2,
            "filetype": "text/html",
            "languages": [
                "eng"
            ],
            "page_number": 1,
            "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
            "text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
        },
        "text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
        "type": "Image"
    }
 ]
--- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
+++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
    assert len(unstructured_elements) == 2
    assert isinstance(unstructured_elements[0], Text)
    assert isinstance(unstructured_elements[1], NarrativeText)
 def test_alternate_text_from_image_is_passed():
    """Alt text of an <img> nested inside a table cell is included in the extracted text."""
    # language=HTML
    input_html = """
    <div class="Page">
    <table>
        <tr>
            <td rowspan="2">Example image nested in the table:</td>
            <td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
        </tr>
    </table>
    </div>
    """
    page = parse_html_to_ontology(input_html)
    # -- no explicit flag: relies on `add_img_alt_text` defaulting to True --
    unstructured_elements = ontology_to_unstructured_elements(page)

    assert len(unstructured_elements) == 2
    # -- presumably element 0 is the Page container and element 1 the table; verify if this fails --
    assert "ALT TEXT Logo" in unstructured_elements[1].text
--- a/test_unstructured/partition/html/test_partition_v2.py
+++ b/test_unstructured/partition/html/test_partition_v2.py
@ -0,0 +1,47 @@
 from unstructured.partition.html import partition_html
 def test_alternative_image_text_can_be_included():
    """`image_alt_mode` decides whether an image's alt text reaches the element text (v2)."""
    # language=HTML
    html = """
    <div class="Page">
        <img src="my-logo.png" alt="ALT TEXT Logo"/>
    </div>
    """
    alt_text = "ALT TEXT Logo"

    # -- two-name unpacking also requires exactly two elements; the last one is
    # -- presumably the image (the first being the Page container) --
    _, image_with_alt = partition_html(
        text=html, image_alt_mode="to_text", html_parser_version="v2"
    )
    assert alt_text in image_with_alt.text

    _, image_without_alt = partition_html(
        text=html, image_alt_mode=None, html_parser_version="v2"
    )
    assert alt_text not in image_without_alt.text
 def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
    """`image_alt_mode` also governs alt text of an image wrapped in a paragraph (v2)."""
    # language=HTML
    html = """
    <div class="Page">
        <p class="Paragraph">
            <img src="my-logo.png" alt="ALT TEXT Logo"/>
        </p>
    </div>
    """
    alt_text = "ALT TEXT Logo"

    # -- two-name unpacking also requires exactly two elements; the last one is
    # -- presumably the paragraph holding the image --
    _, paragraph_with_alt = partition_html(
        text=html, image_alt_mode="to_text", html_parser_version="v2"
    )
    assert alt_text in paragraph_with_alt.text

    _, paragraph_without_alt = partition_html(
        text=html, image_alt_mode=None, html_parser_version="v2"
    )
    assert alt_text not in paragraph_without_alt.text
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.6"  # pragma: no cover
+__version__ = "0.16.7"  # pragma: no cover
--- a/unstructured/documents/ontology.py
+++ b/unstructured/documents/ontology.py
@ -89,11 +89,27 @@ class OntologyElement(BaseModel):
        return result_html
-    def to_text(self, add_children=True) -> str:
+    def to_text(self, add_children=True, add_img_alt_text=True) -> str:
        """
        Returns the text representation of the element.
        Args:
            add_children: If True, the text of the children will be included.
                            Otherwise, element is represented as single self-closing tag.
            add_img_alt_text: If True, the alt text of the image will be included.
        """
        if self.children and add_children:
-            children_text = " ".join(child.to_text().strip() for child in self.children)
+            children_text = " ".join(
                child.to_text(add_children, add_img_alt_text).strip() for child in self.children
            )
            return children_text
-        return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
+
        text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
        if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
            text += f" {self.additional_attributes.get('alt', '')}"
        return text.strip()
    def _construct_attribute_string(self, attributes: dict) -> str:
        return " ".join(
@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
    elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(["input"], frozen=True)
-    def to_text(self, add_children=True) -> str:
+    def to_text(self, add_children=True, add_img_alt_text=True) -> str:
-        text = super().to_text()
+        text = super().to_text(add_children, add_img_alt_text)
        value = self.additional_attributes.get("value", "")
        if not value:
            return text
--- a/unstructured/partition/html/partition.py
+++ b/unstructured/partition/html/partition.py
@ -36,6 +36,7 @@ def partition_html(
    skip_headers_and_footers: bool = False,
    detection_origin: Optional[str] = None,
    html_parser_version: Literal["v1", "v2"] = "v1",
    image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions an HTML document into its constituent elements.
@ -65,6 +66,9 @@ def partition_html(
    html_parser_version (Literal['v1', 'v2']):
        The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
        use the ontology schema to parse the HTML document.
    image_alt_mode (Optional[Literal['to_text']]):
        When set to 'to_text' (the default), the v2 parser will include the alternative text of
        images in the output. Pass None to leave it out.
    """
    # -- parser rejects an empty str, nip that edge-case in the bud here --
    if text is not None and text.strip() == "" and not file and not filename and not url:
@ -81,6 +85,7 @@ def partition_html(
        skip_headers_and_footers=skip_headers_and_footers,
        detection_origin=detection_origin,
        html_parser_version=html_parser_version,
        image_alt_mode=image_alt_mode,
    )
    return list(_HtmlPartitioner.iter_elements(opts))
@ -102,6 +107,7 @@ class HtmlPartitionerOptions:
        skip_headers_and_footers: bool,
        detection_origin: str | None,
        html_parser_version: Literal["v1", "v2"] = "v1",
        image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    ):
        self._file_path = file_path
        self._file = file
@ -113,6 +119,7 @@ class HtmlPartitionerOptions:
        self._skip_headers_and_footers = skip_headers_and_footers
        self._detection_origin = detection_origin
        self._html_parser_version = html_parser_version
        self._image_alt_mode = image_alt_mode
    @lazyproperty
    def detection_origin(self) -> str | None:
@ -172,6 +179,11 @@ class HtmlPartitionerOptions:
        """When html_parser_version=='v2', HTML elements follow ontology schema."""
        return self._html_parser_version
    @lazyproperty
    def add_img_alt_text(self) -> bool:
        """True when the alternative text of images should be included in the output.

        Only the "to_text" mode enables this; any other value (including `None`) disables it.
        """
        alt_mode = self._image_alt_mode
        return alt_mode == "to_text"
 class _HtmlPartitioner:
    """Partition HTML document into document-elements."""
@ -239,5 +251,7 @@ class _HtmlPartitioner:
        """Convert HTML that follows the ontology schema into unstructured elements."""
        html_text = self._opts.html_text
        ontology = parse_html_to_ontology(html_text)
-        unstructured_elements = ontology_to_unstructured_elements(ontology)
+        unstructured_elements = ontology_to_unstructured_elements(
            ontology, add_img_alt_text=self._opts.add_img_alt_text
        )
        return unstructured_elements
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
    page_number: int = None,
    depth: int = 0,
    filename: str | None = None,
    add_img_alt_text: bool = True,
 ) -> list[elements.Element]:
    """
    Converts an OntologyElement object to a list of unstructured Element objects.
@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
        parent_id (str, optional): The ID of the parent element. Defaults to None.
        page_number (int, optional): The page number of the element. Defaults to None.
        depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
-
+        filename (str, optional): The name of the file the element comes from. Defaults to None.
        add_img_alt_text (bool): Whether to include the alternative text of images
                                            in the output. Defaults to True.
    Returns:
        list[Element]: A list of unstructured Element objects.
    """
@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
                page_number=page_number,
                depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
                filename=filename,
                add_img_alt_text=add_img_alt_text,
            )
            children += child
@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
    else:
        element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
        html_code_of_ontology_element = ontology_element.to_html()
-        element_text = ontology_element.to_text()
+        element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)
        unstructured_element = element_class(
            text=element_text,
@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
    Args:
        html_code (str): The HTML code to be parsed.
            Parsing HTML will start from <div class="Page">.
    Returns:
        OntologyElement: The parsed Element object.
@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
    Args:
        soup (Tag): The BeautifulSoup Tag object to be converted.
        recursion_depth (int): Flag to control limit of recursion depth.
    Returns:
        OntologyElement: The converted OntologyElement object.
    """
`@ -1 +1 @@`
	`__version__ = "0.16.6" # pragma: no cover`	`__version__ = "0.16.7" # pragma: no cover`