mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	image alt support (#3797)
This commit is contained in:
		
							parent
							
								
									626f73af5b
								
							
						
					
					
						commit
						e48d79eca1
					
				| @ -1,3 +1,12 @@ | ||||
| ## 0.16.7 | ||||
| 
 | ||||
| ### Enhancements | ||||
| - **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` that controls whether the v2 HTML parser includes image alt text in its output. Set it to `to_text` (the default) to include the `alt` text of `<img>` tags in the element text, or to `None` to omit it. | ||||
| 
 | ||||
| ### Features | ||||
| 
 | ||||
| ### Fixes | ||||
| 
 | ||||
| ## 0.16.6 | ||||
| 
 | ||||
| ### Enhancements | ||||
|  | ||||
| @ -0,0 +1,8 @@ | ||||
| <body class="Document" id="897a8a47377c4ad6aab839a929879537"> | ||||
|  <div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136"> | ||||
|    <header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d"> | ||||
|     <img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/> | ||||
|     <img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/> | ||||
|    </header> | ||||
|  </div> | ||||
| </body> | ||||
| @ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): | ||||
|     [ | ||||
|         ("html_files/example.html", "unstructured_json_output/example.json"), | ||||
|         ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), | ||||
|         ( | ||||
|             "html_files/example_with_alternative_text.html", | ||||
|             "unstructured_json_output/example_with_alternative_text.json", | ||||
|         ), | ||||
|         ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), | ||||
|         ( | ||||
|             "html_files/example_with_inline_fields.html", | ||||
| @ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): | ||||
| def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): | ||||
|     html_file_path = Path(__file__).parent / html_file_path | ||||
|     json_file_path = Path(__file__).parent / json_file_path | ||||
| 
 | ||||
|     expected_json_elements = elements_from_json(str(json_file_path)) | ||||
|     html_code = html_file_path.read_text() | ||||
| 
 | ||||
|     predicted_elements = partition_html( | ||||
|         text=html_code, html_parser_version="v2", unique_element_ids=True | ||||
|     ) | ||||
| 
 | ||||
|     assert len(expected_json_elements) == len(predicted_elements) | ||||
| 
 | ||||
|     for i in range(len(expected_json_elements)): | ||||
|  | ||||
| @ -0,0 +1,62 @@ | ||||
| [ | ||||
|     { | ||||
|         "element_id": "3a6b156a81764e17be128264241f8136", | ||||
|         "metadata": { | ||||
|             "category_depth": 0, | ||||
|             "filetype": "text/html", | ||||
|             "languages": [ | ||||
|                 "eng" | ||||
|             ], | ||||
|             "page_number": 1, | ||||
|             "parent_id": "897a8a47377c4ad6aab839a929879537", | ||||
|             "text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />" | ||||
|         }, | ||||
|         "text": "", | ||||
|         "type": "UncategorizedText" | ||||
|     }, | ||||
|     { | ||||
|         "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", | ||||
|         "metadata": { | ||||
|             "category_depth": 1, | ||||
|             "filetype": "text/html", | ||||
|             "languages": [ | ||||
|                 "eng" | ||||
|             ], | ||||
|             "page_number": 1, | ||||
|             "parent_id": "3a6b156a81764e17be128264241f8136", | ||||
|             "text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />" | ||||
|         }, | ||||
|         "text": "", | ||||
|         "type": "UncategorizedText" | ||||
|     }, | ||||
|     { | ||||
|         "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f", | ||||
|         "metadata": { | ||||
|             "category_depth": 2, | ||||
|             "filetype": "text/html", | ||||
|             "languages": [ | ||||
|                 "eng" | ||||
|             ], | ||||
|             "page_number": 1, | ||||
|             "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", | ||||
|             "text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />" | ||||
|         }, | ||||
|         "text": "New York logo", | ||||
|         "type": "Image" | ||||
|     }, | ||||
|     { | ||||
|         "element_id": "40c32fd8-9a02-42b8-a587-884293881090", | ||||
|         "metadata": { | ||||
|             "category_depth": 2, | ||||
|             "filetype": "text/html", | ||||
|             "languages": [ | ||||
|                 "eng" | ||||
|             ], | ||||
|             "page_number": 1, | ||||
|             "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", | ||||
|             "text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />" | ||||
|         }, | ||||
|         "text": "A line graph showing the comparison of 5 year cumulative total return for stocks", | ||||
|         "type": "Image" | ||||
|     } | ||||
| ] | ||||
| @ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs(): | ||||
|     assert len(unstructured_elements) == 2 | ||||
|     assert isinstance(unstructured_elements[0], Text) | ||||
|     assert isinstance(unstructured_elements[1], NarrativeText) | ||||
| 
 | ||||
| 
 | ||||
| def test_alternate_text_from_image_is_passed(): | ||||
|     # language=HTML | ||||
|     input_html = """ | ||||
|     <div class="Page"> | ||||
|     <table> | ||||
|         <tr> | ||||
|             <td rowspan="2">Example image nested in the table:</td> | ||||
|             <td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td> | ||||
|         </tr> | ||||
|     </table> | ||||
|     </div>add_img_alt_text | ||||
|     """ | ||||
|     page = parse_html_to_ontology(input_html) | ||||
|     unstructured_elements = ontology_to_unstructured_elements(page) | ||||
|     assert len(unstructured_elements) == 2 | ||||
|     assert "ALT TEXT Logo" in unstructured_elements[1].text | ||||
|  | ||||
							
								
								
									
										47
									
								
								test_unstructured/partition/html/test_partition_v2.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								test_unstructured/partition/html/test_partition_v2.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,47 @@ | ||||
| from unstructured.partition.html import partition_html | ||||
| 
 | ||||
| 
 | ||||
| def test_alternative_image_text_can_be_included(): | ||||
|     # language=HTML | ||||
|     html = """ | ||||
|     <div class="Page"> | ||||
|         <img src="my-logo.png" alt="ALT TEXT Logo"/> | ||||
|     </div> | ||||
|     """ | ||||
|     _, image_to_text_alt_mode = partition_html( | ||||
|         text=html, | ||||
|         image_alt_mode="to_text", | ||||
|         html_parser_version="v2", | ||||
|     ) | ||||
|     assert "ALT TEXT Logo" in image_to_text_alt_mode.text | ||||
| 
 | ||||
|     _, image_none_alt_mode = partition_html( | ||||
|         text=html, | ||||
|         image_alt_mode=None, | ||||
|         html_parser_version="v2", | ||||
|     ) | ||||
|     assert "ALT TEXT Logo" not in image_none_alt_mode.text | ||||
| 
 | ||||
| 
 | ||||
| def test_alternative_image_text_can_be_included_when_nested_in_paragraph(): | ||||
|     # language=HTML | ||||
|     html = """ | ||||
|     <div class="Page"> | ||||
|         <p class="Paragraph"> | ||||
|             <img src="my-logo.png" alt="ALT TEXT Logo"/> | ||||
|         </p> | ||||
|     </div> | ||||
|     """ | ||||
|     _, paragraph_to_text_alt_mode = partition_html( | ||||
|         text=html, | ||||
|         image_alt_mode="to_text", | ||||
|         html_parser_version="v2", | ||||
|     ) | ||||
|     assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text | ||||
| 
 | ||||
|     _, paragraph_none_alt_mode = partition_html( | ||||
|         text=html, | ||||
|         image_alt_mode=None, | ||||
|         html_parser_version="v2", | ||||
|     ) | ||||
|     assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text | ||||
| @ -1 +1 @@ | ||||
| __version__ = "0.16.6"  # pragma: no cover | ||||
| __version__ = "0.16.7"  # pragma: no cover | ||||
|  | ||||
| @ -89,11 +89,27 @@ class OntologyElement(BaseModel): | ||||
| 
 | ||||
|         return result_html | ||||
| 
 | ||||
|     def to_text(self, add_children=True) -> str: | ||||
|     def to_text(self, add_children=True, add_img_alt_text=True) -> str: | ||||
|         """ | ||||
|         Returns the text representation of the element. | ||||
| 
 | ||||
|         Args: | ||||
|             add_children: If True, the text of the children will be included. | ||||
|                             Otherwise, element is represented as single self-closing tag. | ||||
|             add_img_alt_text: If True, the alt text of the image will be included. | ||||
|         """ | ||||
|         if self.children and add_children: | ||||
|             children_text = " ".join(child.to_text().strip() for child in self.children) | ||||
|             children_text = " ".join( | ||||
|                 child.to_text(add_children, add_img_alt_text).strip() for child in self.children | ||||
|             ) | ||||
|             return children_text | ||||
|         return BeautifulSoup(self.to_html(), "html.parser").get_text().strip() | ||||
| 
 | ||||
|         text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip() | ||||
| 
 | ||||
|         if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes: | ||||
|             text += f" {self.additional_attributes.get('alt', '')}" | ||||
| 
 | ||||
|         return text.strip() | ||||
| 
 | ||||
|     def _construct_attribute_string(self, attributes: dict) -> str: | ||||
|         return " ".join( | ||||
| @ -473,8 +489,8 @@ class FormFieldValue(OntologyElement): | ||||
|     elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) | ||||
|     allowed_tags: List[str] = Field(["input"], frozen=True) | ||||
| 
 | ||||
|     def to_text(self, add_children=True) -> str: | ||||
|         text = super().to_text() | ||||
|     def to_text(self, add_children=True, add_img_alt_text=True) -> str: | ||||
|         text = super().to_text(add_children, add_img_alt_text) | ||||
|         value = self.additional_attributes.get("value", "") | ||||
|         if not value: | ||||
|             return text | ||||
|  | ||||
| @ -36,6 +36,7 @@ def partition_html( | ||||
|     skip_headers_and_footers: bool = False, | ||||
|     detection_origin: Optional[str] = None, | ||||
|     html_parser_version: Literal["v1", "v2"] = "v1", | ||||
|     image_alt_mode: Optional[Literal["to_text"]] = "to_text", | ||||
|     **kwargs: Any, | ||||
| ) -> list[Element]: | ||||
|     """Partitions an HTML document into its constituent elements. | ||||
| @ -65,6 +66,9 @@ def partition_html( | ||||
|     html_parser_version (Literal['v1', 'v2']): | ||||
|         The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will | ||||
|         use the ontology schema to parse the HTML document. | ||||
| 
 | ||||
|     image_alt_mode (Literal['to_text']): | ||||
|         When set to 'to_text', the v2 parser includes image alt text in the output; None omits it. | ||||
|     """ | ||||
|     # -- parser rejects an empty str, nip that edge-case in the bud here -- | ||||
|     if text is not None and text.strip() == "" and not file and not filename and not url: | ||||
| @ -81,6 +85,7 @@ def partition_html( | ||||
|         skip_headers_and_footers=skip_headers_and_footers, | ||||
|         detection_origin=detection_origin, | ||||
|         html_parser_version=html_parser_version, | ||||
|         image_alt_mode=image_alt_mode, | ||||
|     ) | ||||
| 
 | ||||
|     return list(_HtmlPartitioner.iter_elements(opts)) | ||||
| @ -102,6 +107,7 @@ class HtmlPartitionerOptions: | ||||
|         skip_headers_and_footers: bool, | ||||
|         detection_origin: str | None, | ||||
|         html_parser_version: Literal["v1", "v2"] = "v1", | ||||
|         image_alt_mode: Optional[Literal["to_text"]] = "to_text", | ||||
|     ): | ||||
|         self._file_path = file_path | ||||
|         self._file = file | ||||
| @ -113,6 +119,7 @@ class HtmlPartitionerOptions: | ||||
|         self._skip_headers_and_footers = skip_headers_and_footers | ||||
|         self._detection_origin = detection_origin | ||||
|         self._html_parser_version = html_parser_version | ||||
|         self._image_alt_mode = image_alt_mode | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def detection_origin(self) -> str | None: | ||||
| @ -172,6 +179,11 @@ class HtmlPartitionerOptions: | ||||
|         """When html_parser_version=='v2', HTML elements follow ontology schema.""" | ||||
|         return self._html_parser_version | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def add_img_alt_text(self) -> bool: | ||||
|         """When True, the alternative text of images is included in the output.""" | ||||
|         return self._image_alt_mode == "to_text" | ||||
| 
 | ||||
| 
 | ||||
| class _HtmlPartitioner: | ||||
|     """Partition HTML document into document-elements.""" | ||||
| @ -239,5 +251,7 @@ class _HtmlPartitioner: | ||||
|         """Convert ontology elements represented in HTML to unstructured elements.""" | ||||
|         html_text = self._opts.html_text | ||||
|         ontology = parse_html_to_ontology(html_text) | ||||
|         unstructured_elements = ontology_to_unstructured_elements(ontology) | ||||
|         unstructured_elements = ontology_to_unstructured_elements( | ||||
|             ontology, add_img_alt_text=self._opts.add_img_alt_text | ||||
|         ) | ||||
|         return unstructured_elements | ||||
|  | ||||
| @ -24,6 +24,7 @@ def ontology_to_unstructured_elements( | ||||
|     page_number: int = None, | ||||
|     depth: int = 0, | ||||
|     filename: str | None = None, | ||||
|     add_img_alt_text: bool = True, | ||||
| ) -> list[elements.Element]: | ||||
|     """ | ||||
|     Converts an OntologyElement object to a list of unstructured Element objects. | ||||
| @ -44,7 +45,9 @@ def ontology_to_unstructured_elements( | ||||
|         parent_id (str, optional): The ID of the parent element. Defaults to None. | ||||
|         page_number (int, optional): The page number of the element. Defaults to None. | ||||
|         depth (int, optional): The depth of the element in the hierarchy. Defaults to 0. | ||||
| 
 | ||||
|         filename (str, optional): The name of the file the element comes from. Defaults to None. | ||||
|         add_img_alt_text (bool): Whether to include the alternative text of images | ||||
|                                             in the output. Defaults to True. | ||||
|     Returns: | ||||
|         list[Element]: A list of unstructured Element objects. | ||||
|     """ | ||||
| @ -77,6 +80,7 @@ def ontology_to_unstructured_elements( | ||||
|                 page_number=page_number, | ||||
|                 depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1, | ||||
|                 filename=filename, | ||||
|                 add_img_alt_text=add_img_alt_text, | ||||
|             ) | ||||
|             children += child | ||||
| 
 | ||||
| @ -85,7 +89,7 @@ def ontology_to_unstructured_elements( | ||||
|     else: | ||||
|         element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__] | ||||
|         html_code_of_ontology_element = ontology_element.to_html() | ||||
|         element_text = ontology_element.to_text() | ||||
|         element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text) | ||||
| 
 | ||||
|         unstructured_element = element_class( | ||||
|             text=element_text, | ||||
| @ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement: | ||||
|     Args: | ||||
|         html_code (str): The HTML code to be parsed. | ||||
|             Parsing HTML will start from <div class="Page">. | ||||
| 
 | ||||
|     Returns: | ||||
|         OntologyElement: The parsed Element object. | ||||
| 
 | ||||
| @ -352,7 +355,6 @@ def parse_html_to_ontology_element( | ||||
|     Args: | ||||
|         soup (Tag): The BeautifulSoup Tag object to be converted. | ||||
|         recursion_depth (int): Flag to control limit of recursion depth. | ||||
| 
 | ||||
|     Returns: | ||||
|         OntologyElement: The converted OntologyElement object. | ||||
|     """ | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Pluto
						Pluto