mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	image alt support (#3797)
This commit is contained in:
		
							parent
							
								
									626f73af5b
								
							
						
					
					
						commit
						e48d79eca1
					
				| @ -1,3 +1,12 @@ | |||||||
|  | ## 0.16.7 | ||||||
|  | 
 | ||||||
|  | ### Enhancements | ||||||
|  | - **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. The parameter can be set to `to_text` to extract the alt text of `<img>` HTML tags as text. | ||||||
|  | 
 | ||||||
|  | ### Features | ||||||
|  | 
 | ||||||
|  | ### Fixes | ||||||
|  | 
 | ||||||
| ## 0.16.6 | ## 0.16.6 | ||||||
| 
 | 
 | ||||||
| ### Enhancements | ### Enhancements | ||||||
|  | |||||||
| @ -0,0 +1,8 @@ | |||||||
|  | <body class="Document" id="897a8a47377c4ad6aab839a929879537"> | ||||||
|  |  <div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136"> | ||||||
|  |    <header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d"> | ||||||
|  |     <img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/> | ||||||
|  |     <img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/> | ||||||
|  |    </header> | ||||||
|  |  </div> | ||||||
|  | </body> | ||||||
| @ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): | |||||||
|     [ |     [ | ||||||
|         ("html_files/example.html", "unstructured_json_output/example.json"), |         ("html_files/example.html", "unstructured_json_output/example.json"), | ||||||
|         ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), |         ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), | ||||||
|  |         ( | ||||||
|  |             "html_files/example_with_alternative_text.html", | ||||||
|  |             "unstructured_json_output/example_with_alternative_text.json", | ||||||
|  |         ), | ||||||
|         ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), |         ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), | ||||||
|         ( |         ( | ||||||
|             "html_files/example_with_inline_fields.html", |             "html_files/example_with_inline_fields.html", | ||||||
| @ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): | |||||||
| def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): | def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): | ||||||
|     html_file_path = Path(__file__).parent / html_file_path |     html_file_path = Path(__file__).parent / html_file_path | ||||||
|     json_file_path = Path(__file__).parent / json_file_path |     json_file_path = Path(__file__).parent / json_file_path | ||||||
| 
 |  | ||||||
|     expected_json_elements = elements_from_json(str(json_file_path)) |     expected_json_elements = elements_from_json(str(json_file_path)) | ||||||
|     html_code = html_file_path.read_text() |     html_code = html_file_path.read_text() | ||||||
| 
 | 
 | ||||||
|     predicted_elements = partition_html( |     predicted_elements = partition_html( | ||||||
|         text=html_code, html_parser_version="v2", unique_element_ids=True |         text=html_code, html_parser_version="v2", unique_element_ids=True | ||||||
|     ) |     ) | ||||||
|  | 
 | ||||||
|     assert len(expected_json_elements) == len(predicted_elements) |     assert len(expected_json_elements) == len(predicted_elements) | ||||||
| 
 | 
 | ||||||
|     for i in range(len(expected_json_elements)): |     for i in range(len(expected_json_elements)): | ||||||
|  | |||||||
| @ -0,0 +1,62 @@ | |||||||
|  | [ | ||||||
|  |     { | ||||||
|  |         "element_id": "3a6b156a81764e17be128264241f8136", | ||||||
|  |         "metadata": { | ||||||
|  |             "category_depth": 0, | ||||||
|  |             "filetype": "text/html", | ||||||
|  |             "languages": [ | ||||||
|  |                 "eng" | ||||||
|  |             ], | ||||||
|  |             "page_number": 1, | ||||||
|  |             "parent_id": "897a8a47377c4ad6aab839a929879537", | ||||||
|  |             "text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />" | ||||||
|  |         }, | ||||||
|  |         "text": "", | ||||||
|  |         "type": "UncategorizedText" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |         "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", | ||||||
|  |         "metadata": { | ||||||
|  |             "category_depth": 1, | ||||||
|  |             "filetype": "text/html", | ||||||
|  |             "languages": [ | ||||||
|  |                 "eng" | ||||||
|  |             ], | ||||||
|  |             "page_number": 1, | ||||||
|  |             "parent_id": "3a6b156a81764e17be128264241f8136", | ||||||
|  |             "text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />" | ||||||
|  |         }, | ||||||
|  |         "text": "", | ||||||
|  |         "type": "UncategorizedText" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |         "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f", | ||||||
|  |         "metadata": { | ||||||
|  |             "category_depth": 2, | ||||||
|  |             "filetype": "text/html", | ||||||
|  |             "languages": [ | ||||||
|  |                 "eng" | ||||||
|  |             ], | ||||||
|  |             "page_number": 1, | ||||||
|  |             "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", | ||||||
|  |             "text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />" | ||||||
|  |         }, | ||||||
|  |         "text": "New York logo", | ||||||
|  |         "type": "Image" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |         "element_id": "40c32fd8-9a02-42b8-a587-884293881090", | ||||||
|  |         "metadata": { | ||||||
|  |             "category_depth": 2, | ||||||
|  |             "filetype": "text/html", | ||||||
|  |             "languages": [ | ||||||
|  |                 "eng" | ||||||
|  |             ], | ||||||
|  |             "page_number": 1, | ||||||
|  |             "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", | ||||||
|  |             "text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />" | ||||||
|  |         }, | ||||||
|  |         "text": "A line graph showing the comparison of 5 year cumulative total return for stocks", | ||||||
|  |         "type": "Image" | ||||||
|  |     } | ||||||
|  | ] | ||||||
| @ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs(): | |||||||
|     assert len(unstructured_elements) == 2 |     assert len(unstructured_elements) == 2 | ||||||
|     assert isinstance(unstructured_elements[0], Text) |     assert isinstance(unstructured_elements[0], Text) | ||||||
|     assert isinstance(unstructured_elements[1], NarrativeText) |     assert isinstance(unstructured_elements[1], NarrativeText) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_alternate_text_from_image_is_passed(): | ||||||
|  |     # language=HTML | ||||||
|  |     input_html = """ | ||||||
|  |     <div class="Page"> | ||||||
|  |     <table> | ||||||
|  |         <tr> | ||||||
|  |             <td rowspan="2">Example image nested in the table:</td> | ||||||
|  |             <td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td> | ||||||
|  |         </tr> | ||||||
|  |     </table> | ||||||
|  |     </div>add_img_alt_text | ||||||
|  |     """ | ||||||
|  |     page = parse_html_to_ontology(input_html) | ||||||
|  |     unstructured_elements = ontology_to_unstructured_elements(page) | ||||||
|  |     assert len(unstructured_elements) == 2 | ||||||
|  |     assert "ALT TEXT Logo" in unstructured_elements[1].text | ||||||
|  | |||||||
							
								
								
									
										47
									
								
								test_unstructured/partition/html/test_partition_v2.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								test_unstructured/partition/html/test_partition_v2.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,47 @@ | |||||||
|  | from unstructured.partition.html import partition_html | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_alternative_image_text_can_be_included(): | ||||||
|  |     # language=HTML | ||||||
|  |     html = """ | ||||||
|  |     <div class="Page"> | ||||||
|  |         <img src="my-logo.png" alt="ALT TEXT Logo"/> | ||||||
|  |     </div> | ||||||
|  |     """ | ||||||
|  |     _, image_to_text_alt_mode = partition_html( | ||||||
|  |         text=html, | ||||||
|  |         image_alt_mode="to_text", | ||||||
|  |         html_parser_version="v2", | ||||||
|  |     ) | ||||||
|  |     assert "ALT TEXT Logo" in image_to_text_alt_mode.text | ||||||
|  | 
 | ||||||
|  |     _, image_none_alt_mode = partition_html( | ||||||
|  |         text=html, | ||||||
|  |         image_alt_mode=None, | ||||||
|  |         html_parser_version="v2", | ||||||
|  |     ) | ||||||
|  |     assert "ALT TEXT Logo" not in image_none_alt_mode.text | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_alternative_image_text_can_be_included_when_nested_in_paragraph(): | ||||||
|  |     # language=HTML | ||||||
|  |     html = """ | ||||||
|  |     <div class="Page"> | ||||||
|  |         <p class="Paragraph"> | ||||||
|  |             <img src="my-logo.png" alt="ALT TEXT Logo"/> | ||||||
|  |         </p> | ||||||
|  |     </div> | ||||||
|  |     """ | ||||||
|  |     _, paragraph_to_text_alt_mode = partition_html( | ||||||
|  |         text=html, | ||||||
|  |         image_alt_mode="to_text", | ||||||
|  |         html_parser_version="v2", | ||||||
|  |     ) | ||||||
|  |     assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text | ||||||
|  | 
 | ||||||
|  |     _, paragraph_none_alt_mode = partition_html( | ||||||
|  |         text=html, | ||||||
|  |         image_alt_mode=None, | ||||||
|  |         html_parser_version="v2", | ||||||
|  |     ) | ||||||
|  |     assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text | ||||||
| @ -1 +1 @@ | |||||||
| __version__ = "0.16.6"  # pragma: no cover | __version__ = "0.16.7"  # pragma: no cover | ||||||
|  | |||||||
| @ -89,11 +89,27 @@ class OntologyElement(BaseModel): | |||||||
| 
 | 
 | ||||||
|         return result_html |         return result_html | ||||||
| 
 | 
 | ||||||
|     def to_text(self, add_children=True) -> str: |     def to_text(self, add_children=True, add_img_alt_text=True) -> str: | ||||||
|  |         """ | ||||||
|  |         Returns the text representation of the element. | ||||||
|  | 
 | ||||||
|  |         Args: | ||||||
|  |             add_children: If True, the text of the children will be included. | ||||||
|  |                             Otherwise, element is represented as single self-closing tag. | ||||||
|  |             add_img_alt_text: If True, the alt text of the image will be included. | ||||||
|  |         """ | ||||||
|         if self.children and add_children: |         if self.children and add_children: | ||||||
|             children_text = " ".join(child.to_text().strip() for child in self.children) |             children_text = " ".join( | ||||||
|  |                 child.to_text(add_children, add_img_alt_text).strip() for child in self.children | ||||||
|  |             ) | ||||||
|             return children_text |             return children_text | ||||||
|         return BeautifulSoup(self.to_html(), "html.parser").get_text().strip() | 
 | ||||||
|  |         text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip() | ||||||
|  | 
 | ||||||
|  |         if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes: | ||||||
|  |             text += f" {self.additional_attributes.get('alt', '')}" | ||||||
|  | 
 | ||||||
|  |         return text.strip() | ||||||
| 
 | 
 | ||||||
|     def _construct_attribute_string(self, attributes: dict) -> str: |     def _construct_attribute_string(self, attributes: dict) -> str: | ||||||
|         return " ".join( |         return " ".join( | ||||||
| @ -473,8 +489,8 @@ class FormFieldValue(OntologyElement): | |||||||
|     elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) |     elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) | ||||||
|     allowed_tags: List[str] = Field(["input"], frozen=True) |     allowed_tags: List[str] = Field(["input"], frozen=True) | ||||||
| 
 | 
 | ||||||
|     def to_text(self, add_children=True) -> str: |     def to_text(self, add_children=True, add_img_alt_text=True) -> str: | ||||||
|         text = super().to_text() |         text = super().to_text(add_children, add_img_alt_text) | ||||||
|         value = self.additional_attributes.get("value", "") |         value = self.additional_attributes.get("value", "") | ||||||
|         if not value: |         if not value: | ||||||
|             return text |             return text | ||||||
|  | |||||||
| @ -36,6 +36,7 @@ def partition_html( | |||||||
|     skip_headers_and_footers: bool = False, |     skip_headers_and_footers: bool = False, | ||||||
|     detection_origin: Optional[str] = None, |     detection_origin: Optional[str] = None, | ||||||
|     html_parser_version: Literal["v1", "v2"] = "v1", |     html_parser_version: Literal["v1", "v2"] = "v1", | ||||||
|  |     image_alt_mode: Optional[Literal["to_text"]] = "to_text", | ||||||
|     **kwargs: Any, |     **kwargs: Any, | ||||||
| ) -> list[Element]: | ) -> list[Element]: | ||||||
|     """Partitions an HTML document into its constituent elements. |     """Partitions an HTML document into its constituent elements. | ||||||
| @ -65,6 +66,9 @@ def partition_html( | |||||||
|     html_parser_version (Literal['v1', 'v2']): |     html_parser_version (Literal['v1', 'v2']): | ||||||
|         The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will |         The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will | ||||||
|         use the ontology schema to parse the HTML document. |         use the ontology schema to parse the HTML document. | ||||||
|  | 
 | ||||||
|  |     image_alt_mode (Optional[Literal['to_text']]): | ||||||
|  |         When set to 'to_text', the v2 parser will include the alternative text of images in the output. When None, the alternative text is omitted. | ||||||
|     """ |     """ | ||||||
|     # -- parser rejects an empty str, nip that edge-case in the bud here -- |     # -- parser rejects an empty str, nip that edge-case in the bud here -- | ||||||
|     if text is not None and text.strip() == "" and not file and not filename and not url: |     if text is not None and text.strip() == "" and not file and not filename and not url: | ||||||
| @ -81,6 +85,7 @@ def partition_html( | |||||||
|         skip_headers_and_footers=skip_headers_and_footers, |         skip_headers_and_footers=skip_headers_and_footers, | ||||||
|         detection_origin=detection_origin, |         detection_origin=detection_origin, | ||||||
|         html_parser_version=html_parser_version, |         html_parser_version=html_parser_version, | ||||||
|  |         image_alt_mode=image_alt_mode, | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     return list(_HtmlPartitioner.iter_elements(opts)) |     return list(_HtmlPartitioner.iter_elements(opts)) | ||||||
| @ -102,6 +107,7 @@ class HtmlPartitionerOptions: | |||||||
|         skip_headers_and_footers: bool, |         skip_headers_and_footers: bool, | ||||||
|         detection_origin: str | None, |         detection_origin: str | None, | ||||||
|         html_parser_version: Literal["v1", "v2"] = "v1", |         html_parser_version: Literal["v1", "v2"] = "v1", | ||||||
|  |         image_alt_mode: Optional[Literal["to_text"]] = "to_text", | ||||||
|     ): |     ): | ||||||
|         self._file_path = file_path |         self._file_path = file_path | ||||||
|         self._file = file |         self._file = file | ||||||
| @ -113,6 +119,7 @@ class HtmlPartitionerOptions: | |||||||
|         self._skip_headers_and_footers = skip_headers_and_footers |         self._skip_headers_and_footers = skip_headers_and_footers | ||||||
|         self._detection_origin = detection_origin |         self._detection_origin = detection_origin | ||||||
|         self._html_parser_version = html_parser_version |         self._html_parser_version = html_parser_version | ||||||
|  |         self._image_alt_mode = image_alt_mode | ||||||
| 
 | 
 | ||||||
|     @lazyproperty |     @lazyproperty | ||||||
|     def detection_origin(self) -> str | None: |     def detection_origin(self) -> str | None: | ||||||
| @ -172,6 +179,11 @@ class HtmlPartitionerOptions: | |||||||
|         """When html_parser_version=='v2', HTML elements follow ontology schema.""" |         """When html_parser_version=='v2', HTML elements follow ontology schema.""" | ||||||
|         return self._html_parser_version |         return self._html_parser_version | ||||||
| 
 | 
 | ||||||
|  |     @lazyproperty | ||||||
|  |     def add_img_alt_text(self) -> bool: | ||||||
|  |         """When True, the alternative text of images is included in the output.""" | ||||||
|  |         return self._image_alt_mode == "to_text" | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class _HtmlPartitioner: | class _HtmlPartitioner: | ||||||
|     """Partition HTML document into document-elements.""" |     """Partition HTML document into document-elements.""" | ||||||
| @ -239,5 +251,7 @@ class _HtmlPartitioner: | |||||||
|         """Convert an ontology elements represented in HTML to an ontology element.""" |         """Convert an ontology elements represented in HTML to an ontology element.""" | ||||||
|         html_text = self._opts.html_text |         html_text = self._opts.html_text | ||||||
|         ontology = parse_html_to_ontology(html_text) |         ontology = parse_html_to_ontology(html_text) | ||||||
|         unstructured_elements = ontology_to_unstructured_elements(ontology) |         unstructured_elements = ontology_to_unstructured_elements( | ||||||
|  |             ontology, add_img_alt_text=self._opts.add_img_alt_text | ||||||
|  |         ) | ||||||
|         return unstructured_elements |         return unstructured_elements | ||||||
|  | |||||||
| @ -24,6 +24,7 @@ def ontology_to_unstructured_elements( | |||||||
|     page_number: int = None, |     page_number: int = None, | ||||||
|     depth: int = 0, |     depth: int = 0, | ||||||
|     filename: str | None = None, |     filename: str | None = None, | ||||||
|  |     add_img_alt_text: bool = True, | ||||||
| ) -> list[elements.Element]: | ) -> list[elements.Element]: | ||||||
|     """ |     """ | ||||||
|     Converts an OntologyElement object to a list of unstructured Element objects. |     Converts an OntologyElement object to a list of unstructured Element objects. | ||||||
| @ -44,7 +45,9 @@ def ontology_to_unstructured_elements( | |||||||
|         parent_id (str, optional): The ID of the parent element. Defaults to None. |         parent_id (str, optional): The ID of the parent element. Defaults to None. | ||||||
|         page_number (int, optional): The page number of the element. Defaults to None. |         page_number (int, optional): The page number of the element. Defaults to None. | ||||||
|         depth (int, optional): The depth of the element in the hierarchy. Defaults to 0. |         depth (int, optional): The depth of the element in the hierarchy. Defaults to 0. | ||||||
| 
 |         filename (str, optional): The name of the file the element comes from. Defaults to None. | ||||||
|  |         add_img_alt_text (bool): Whether to include the alternative text of images | ||||||
|  |                                             in the output. Defaults to True. | ||||||
|     Returns: |     Returns: | ||||||
|         list[Element]: A list of unstructured Element objects. |         list[Element]: A list of unstructured Element objects. | ||||||
|     """ |     """ | ||||||
| @ -77,6 +80,7 @@ def ontology_to_unstructured_elements( | |||||||
|                 page_number=page_number, |                 page_number=page_number, | ||||||
|                 depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1, |                 depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1, | ||||||
|                 filename=filename, |                 filename=filename, | ||||||
|  |                 add_img_alt_text=add_img_alt_text, | ||||||
|             ) |             ) | ||||||
|             children += child |             children += child | ||||||
| 
 | 
 | ||||||
| @ -85,7 +89,7 @@ def ontology_to_unstructured_elements( | |||||||
|     else: |     else: | ||||||
|         element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__] |         element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__] | ||||||
|         html_code_of_ontology_element = ontology_element.to_html() |         html_code_of_ontology_element = ontology_element.to_html() | ||||||
|         element_text = ontology_element.to_text() |         element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text) | ||||||
| 
 | 
 | ||||||
|         unstructured_element = element_class( |         unstructured_element = element_class( | ||||||
|             text=element_text, |             text=element_text, | ||||||
| @ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement: | |||||||
|     Args: |     Args: | ||||||
|         html_code (str): The HTML code to be parsed. |         html_code (str): The HTML code to be parsed. | ||||||
|             Parsing HTML will start from <div class="Page">. |             Parsing HTML will start from <div class="Page">. | ||||||
| 
 |  | ||||||
|     Returns: |     Returns: | ||||||
|         OntologyElement: The parsed Element object. |         OntologyElement: The parsed Element object. | ||||||
| 
 | 
 | ||||||
| @ -352,7 +355,6 @@ def parse_html_to_ontology_element( | |||||||
|     Args: |     Args: | ||||||
|         soup (Tag): The BeautifulSoup Tag object to be converted. |         soup (Tag): The BeautifulSoup Tag object to be converted. | ||||||
|         recursion_depth (int): Flag to control limit of recursion depth. |         recursion_depth (int): Flag to control limit of recursion depth. | ||||||
| 
 |  | ||||||
|     Returns: |     Returns: | ||||||
|         OntologyElement: The converted OntologyElement object. |         OntologyElement: The converted OntologyElement object. | ||||||
|     """ |     """ | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Pluto
						Pluto