mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 18:14:51 +00:00 
			
		
		
		
	Fix when parent id is none for first element in v2 notion (#3752)
This commit is contained in:
		
							parent
							
								
									9835fe4d5b
								
							
						
					
					
						commit
						2417f8ed84
					
				
							
								
								
									
										11
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								CHANGELOG.md
									
									
									
									
									
								
							| @ -1,3 +1,14 @@ | ||||
| ## 0.16.3-dev1 | ||||
| 
 | ||||
| ### Enhancements | ||||
| 
 | ||||
| ### Features | ||||
| 
 | ||||
| ### Fixes | ||||
| 
 | ||||
| * **V2 elements can be parsed when the first element has no parent ID** | ||||
| 
 | ||||
| 
 | ||||
| ## 0.16.2 | ||||
| 
 | ||||
| ### Enhancements | ||||
|  | ||||
| @ -0,0 +1,33 @@ | ||||
| from unstructured.documents.elements import ElementMetadata, NarrativeText, Text | ||||
| from unstructured.documents.ontology import Document, Page, Paragraph | ||||
| from unstructured.partition.html.transformations import unstructured_elements_to_ontology | ||||
| 
 | ||||
| 
 | ||||
| def test_when_first_elements_does_not_have_id(): | ||||
|     unstructured_elements = [ | ||||
|         Text( | ||||
|             element_id="1", | ||||
|             text="", | ||||
|             metadata=ElementMetadata(text_as_html='<div class="Page" id="1"/>'), | ||||
|         ), | ||||
|         NarrativeText( | ||||
|             element_id="2", | ||||
|             text="Example text", | ||||
|             metadata=ElementMetadata( | ||||
|                 text_as_html='<p class="Paragraph" id="2"> Example text </p>', parent_id="1" | ||||
|             ), | ||||
|         ), | ||||
|     ] | ||||
|     ontology = unstructured_elements_to_ontology(unstructured_elements) | ||||
| 
 | ||||
|     assert isinstance(ontology, Document) | ||||
| 
 | ||||
|     assert len(ontology.children) == 1 | ||||
|     page = ontology.children[0] | ||||
| 
 | ||||
|     assert isinstance(page, Page) | ||||
|     assert len(page.children) == 1 | ||||
|     paragraph = page.children[0] | ||||
| 
 | ||||
|     assert isinstance(paragraph, Paragraph) | ||||
|     assert paragraph.text == "Example text" | ||||
| @ -1 +1 @@ | ||||
| __version__ = "0.16.2"  # pragma: no cover | ||||
| __version__ = "0.16.3-dev1"  # pragma: no cover | ||||
|  | ||||
| @ -67,7 +67,11 @@ class OntologyElement(BaseModel): | ||||
|         if self.html_tag_name == "": | ||||
|             self.html_tag_name = self.allowed_tags[0] | ||||
|         if "id" not in self.additional_attributes: | ||||
|             self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "") | ||||
|             self.additional_attributes["id"] = self.generate_unique_id() | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def generate_unique_id() -> str: | ||||
|         return str(uuid.uuid4()).replace("-", "") | ||||
| 
 | ||||
|     def to_html(self, add_children=True) -> str: | ||||
|         additional_attrs = copy(self.additional_attributes) | ||||
|  | ||||
| @ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) | ||||
|     id_to_element_mapping = OrderedDict() | ||||
| 
 | ||||
|     document_element_id = unstructured_elements[0].metadata.parent_id | ||||
| 
 | ||||
|     if document_element_id is None: | ||||
|         document_element_id = OntologyElement.generate_unique_id() | ||||
|         unstructured_elements[0].metadata.parent_id = document_element_id | ||||
| 
 | ||||
|     id_to_element_mapping[document_element_id] = Document( | ||||
|         additional_attributes={"id": document_element_id} | ||||
|     ) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Pluto
						Pluto