mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	Fix when parent id is None for first element in v2 notion (#3752)
This commit is contained in:
		
							parent
							
								
									9835fe4d5b
								
							
						
					
					
						commit
						2417f8ed84
					
				
							
								
								
									
										11
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								CHANGELOG.md
									
									
									
									
									
								
							@ -1,3 +1,14 @@
 | 
			
		||||
## 0.16.3-dev1
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
### Features
 | 
			
		||||
 | 
			
		||||
### Fixes
 | 
			
		||||
 | 
			
		||||
* **V2 elements can be parsed when the first element has no parent ID**
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 0.16.2
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
@ -0,0 +1,33 @@
 | 
			
		||||
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text
 | 
			
		||||
from unstructured.documents.ontology import Document, Page, Paragraph
 | 
			
		||||
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_when_first_elements_does_not_have_id():
 | 
			
		||||
    unstructured_elements = [
 | 
			
		||||
        Text(
 | 
			
		||||
            element_id="1",
 | 
			
		||||
            text="",
 | 
			
		||||
            metadata=ElementMetadata(text_as_html='<div class="Page" id="1"/>'),
 | 
			
		||||
        ),
 | 
			
		||||
        NarrativeText(
 | 
			
		||||
            element_id="2",
 | 
			
		||||
            text="Example text",
 | 
			
		||||
            metadata=ElementMetadata(
 | 
			
		||||
                text_as_html='<p class="Paragraph" id="2"> Example text </p>', parent_id="1"
 | 
			
		||||
            ),
 | 
			
		||||
        ),
 | 
			
		||||
    ]
 | 
			
		||||
    ontology = unstructured_elements_to_ontology(unstructured_elements)
 | 
			
		||||
 | 
			
		||||
    assert isinstance(ontology, Document)
 | 
			
		||||
 | 
			
		||||
    assert len(ontology.children) == 1
 | 
			
		||||
    page = ontology.children[0]
 | 
			
		||||
 | 
			
		||||
    assert isinstance(page, Page)
 | 
			
		||||
    assert len(page.children) == 1
 | 
			
		||||
    paragraph = page.children[0]
 | 
			
		||||
 | 
			
		||||
    assert isinstance(paragraph, Paragraph)
 | 
			
		||||
    assert paragraph.text == "Example text"
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.16.2"  # pragma: no cover
 | 
			
		||||
__version__ = "0.16.3-dev1"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -67,7 +67,11 @@ class OntologyElement(BaseModel):
 | 
			
		||||
        if self.html_tag_name == "":
 | 
			
		||||
            self.html_tag_name = self.allowed_tags[0]
 | 
			
		||||
        if "id" not in self.additional_attributes:
 | 
			
		||||
            self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "")
 | 
			
		||||
            self.additional_attributes["id"] = self.generate_unique_id()
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def generate_unique_id() -> str:
 | 
			
		||||
        return str(uuid.uuid4()).replace("-", "")
 | 
			
		||||
 | 
			
		||||
    def to_html(self, add_children=True) -> str:
 | 
			
		||||
        additional_attrs = copy(self.additional_attributes)
 | 
			
		||||
 | 
			
		||||
@ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
 | 
			
		||||
    id_to_element_mapping = OrderedDict()
 | 
			
		||||
 | 
			
		||||
    document_element_id = unstructured_elements[0].metadata.parent_id
 | 
			
		||||
 | 
			
		||||
    if document_element_id is None:
 | 
			
		||||
        document_element_id = OntologyElement.generate_unique_id()
 | 
			
		||||
        unstructured_elements[0].metadata.parent_id = document_element_id
 | 
			
		||||
 | 
			
		||||
    id_to_element_mapping[document_element_id] = Document(
 | 
			
		||||
        additional_attributes={"id": document_element_id}
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user