Fix parsing when the parent ID is None for the first element in v2 notation (#3752)

This commit is contained in:
Pluto 2024-10-25 11:43:36 +02:00 committed by GitHub
parent 9835fe4d5b
commit 2417f8ed84
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 55 additions and 2 deletions

View File

@ -1,3 +1,14 @@
## 0.16.3-dev1
### Enhancements
### Features
### Fixes
* **V2 elements can be parsed when the first element has no parent ID**
## 0.16.2
### Enhancements

View File

@ -0,0 +1,33 @@
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text
from unstructured.documents.ontology import Document, Page, Paragraph
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
def test_when_first_elements_does_not_have_id():
    """Convert elements whose first entry is built without a ``parent_id``.

    The input is a Page-like ``Text`` element (no ``parent_id`` supplied)
    followed by a ``NarrativeText`` paragraph that points at it via
    ``parent_id="1"``. The conversion is expected to yield a ``Document``
    containing exactly one ``Page`` that contains exactly one ``Paragraph``.
    """
    page_element = Text(
        element_id="1",
        text="",
        metadata=ElementMetadata(text_as_html='<div class="Page" id="1"/>'),
    )
    paragraph_element = NarrativeText(
        element_id="2",
        text="Example text",
        metadata=ElementMetadata(
            text_as_html='<p class="Paragraph" id="2"> Example text </p>', parent_id="1"
        ),
    )

    document = unstructured_elements_to_ontology([page_element, paragraph_element])

    # Root of the converted tree.
    assert isinstance(document, Document)
    assert len(document.children) == 1

    # The single child of the document is the page.
    first_page = document.children[0]
    assert isinstance(first_page, Page)
    assert len(first_page.children) == 1

    # The single child of the page is the paragraph with the original text.
    first_paragraph = first_page.children[0]
    assert isinstance(first_paragraph, Paragraph)
    assert first_paragraph.text == "Example text"

View File

@ -1 +1 @@
# Package version string. This hunk replaces one line: the first assignment is
# the previous value and the second is the new one.
__version__ = "0.16.2" # pragma: no cover
__version__ = "0.16.3-dev1" # pragma: no cover

View File

@ -67,7 +67,11 @@ class OntologyElement(BaseModel):
if self.html_tag_name == "":
self.html_tag_name = self.allowed_tags[0]
if "id" not in self.additional_attributes:
self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "")
self.additional_attributes["id"] = self.generate_unique_id()
@staticmethod
def generate_unique_id() -> str:
    """Return a fresh random identifier as 32 lowercase hex characters.

    The value is a version-4 UUID rendered without the dash separators.
    """
    # ``UUID.hex`` is the dash-free form of ``str(uuid)``.
    return uuid.uuid4().hex
def to_html(self, add_children=True) -> str:
additional_attrs = copy(self.additional_attributes)

View File

@ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
id_to_element_mapping = OrderedDict()
document_element_id = unstructured_elements[0].metadata.parent_id
if document_element_id is None:
document_element_id = OntologyElement.generate_unique_id()
unstructured_elements[0].metadata.parent_id = document_element_id
id_to_element_mapping[document_element_id] = Document(
additional_attributes={"id": document_element_id}
)