mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-16 20:10:19 +00:00
Fix when parent id is none for first element in v2 notion: (#3752)
This commit is contained in:
parent
9835fe4d5b
commit
2417f8ed84
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
|||||||
|
## 0.16.3-dev1
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
* **V2 elements can be parsed when the first element has no parent ID**
|
||||||
|
|
||||||
|
|
||||||
## 0.16.2
|
## 0.16.2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -0,0 +1,33 @@
|
|||||||
|
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text
|
||||||
|
from unstructured.documents.ontology import Document, Page, Paragraph
|
||||||
|
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
|
||||||
|
|
||||||
|
|
||||||
|
def test_when_first_elements_does_not_have_id():
    """Convert elements to an ontology when the first one carries no parent id.

    The first element (a Page) is built without ``parent_id`` in its metadata;
    the second (a Paragraph) points at the first through ``parent_id="1"``.
    The resulting tree must be Document -> Page -> Paragraph.
    """
    # First element: metadata is constructed without a parent_id.
    page_element = Text(
        element_id="1",
        text="",
        metadata=ElementMetadata(text_as_html='<div class="Page" id="1"/>'),
    )
    # Second element: child of the page above.
    paragraph_element = NarrativeText(
        element_id="2",
        text="Example text",
        metadata=ElementMetadata(
            text_as_html='<p class="Paragraph" id="2"> Example text </p>', parent_id="1"
        ),
    )

    ontology = unstructured_elements_to_ontology([page_element, paragraph_element])

    # Root of the tree is a Document holding exactly one Page.
    assert isinstance(ontology, Document)
    document_children = ontology.children
    assert len(document_children) == 1

    page = document_children[0]
    assert isinstance(page, Page)
    page_children = page.children
    assert len(page_children) == 1

    # The Page holds exactly one Paragraph with the original text.
    paragraph = page_children[0]
    assert isinstance(paragraph, Paragraph)
    assert paragraph.text == "Example text"
|
@ -1 +1 @@
|
|||||||
__version__ = "0.16.2" # pragma: no cover
|
__version__ = "0.16.3-dev1" # pragma: no cover
|
||||||
|
@ -67,7 +67,11 @@ class OntologyElement(BaseModel):
|
|||||||
if self.html_tag_name == "":
|
if self.html_tag_name == "":
|
||||||
self.html_tag_name = self.allowed_tags[0]
|
self.html_tag_name = self.allowed_tags[0]
|
||||||
if "id" not in self.additional_attributes:
|
if "id" not in self.additional_attributes:
|
||||||
self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "")
|
self.additional_attributes["id"] = self.generate_unique_id()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def generate_unique_id() -> str:
|
||||||
|
return str(uuid.uuid4()).replace("-", "")
|
||||||
|
|
||||||
def to_html(self, add_children=True) -> str:
|
def to_html(self, add_children=True) -> str:
|
||||||
additional_attrs = copy(self.additional_attributes)
|
additional_attrs = copy(self.additional_attributes)
|
||||||
|
@ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
|
|||||||
id_to_element_mapping = OrderedDict()
|
id_to_element_mapping = OrderedDict()
|
||||||
|
|
||||||
document_element_id = unstructured_elements[0].metadata.parent_id
|
document_element_id = unstructured_elements[0].metadata.parent_id
|
||||||
|
|
||||||
|
if document_element_id is None:
|
||||||
|
document_element_id = OntologyElement.generate_unique_id()
|
||||||
|
unstructured_elements[0].metadata.parent_id = document_element_id
|
||||||
|
|
||||||
id_to_element_mapping[document_element_id] = Document(
|
id_to_element_mapping[document_element_id] = Document(
|
||||||
additional_attributes={"id": document_element_id}
|
additional_attributes={"id": document_element_id}
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user