mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-16 03:48:33 +00:00
Fix when parent id is none for first element in v2 notion: (#3752)
This commit is contained in:
parent
9835fe4d5b
commit
2417f8ed84
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
||||
## 0.16.3-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* **V2 elements can be parsed when the first element has no parent ID**
|
||||
|
||||
|
||||
## 0.16.2
|
||||
|
||||
### Enhancements
|
||||
|
@ -0,0 +1,33 @@
|
||||
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text
|
||||
from unstructured.documents.ontology import Document, Page, Paragraph
|
||||
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
|
||||
|
||||
|
||||
def test_when_first_elements_does_not_have_id():
    """A leading element that has no ``parent_id`` still converts into a Document tree."""
    # The page-level element is built without a parent_id in its metadata.
    page_element = Text(
        element_id="1",
        text="",
        metadata=ElementMetadata(text_as_html='<div class="Page" id="1"/>'),
    )
    # The paragraph points at the page element through parent_id="1".
    paragraph_metadata = ElementMetadata(
        text_as_html='<p class="Paragraph" id="2"> Example text </p>',
        parent_id="1",
    )
    paragraph_element = NarrativeText(
        element_id="2",
        text="Example text",
        metadata=paragraph_metadata,
    )

    ontology = unstructured_elements_to_ontology([page_element, paragraph_element])

    # Root: a Document holding exactly one child.
    assert isinstance(ontology, Document)
    assert len(ontology.children) == 1

    # First level: the Page, holding exactly one child.
    page = ontology.children[0]
    assert isinstance(page, Page)
    assert len(page.children) == 1

    # Second level: the Paragraph carrying the stripped text.
    paragraph = page.children[0]
    assert isinstance(paragraph, Paragraph)
    assert paragraph.text == "Example text"
|
@ -1 +1 @@
|
||||
__version__ = "0.16.2" # pragma: no cover
|
||||
__version__ = "0.16.3-dev1" # pragma: no cover
|
||||
|
@ -67,7 +67,11 @@ class OntologyElement(BaseModel):
|
||||
if self.html_tag_name == "":
|
||||
self.html_tag_name = self.allowed_tags[0]
|
||||
if "id" not in self.additional_attributes:
|
||||
self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "")
|
||||
self.additional_attributes["id"] = self.generate_unique_id()
|
||||
|
||||
@staticmethod
def generate_unique_id() -> str:
    """Return a fresh random identifier: a UUID4 rendered as 32 hex characters, no dashes."""
    # UUID.hex is the dash-free lowercase hexadecimal form of the UUID.
    return uuid.uuid4().hex
|
||||
|
||||
def to_html(self, add_children=True) -> str:
|
||||
additional_attrs = copy(self.additional_attributes)
|
||||
|
@ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
|
||||
id_to_element_mapping = OrderedDict()
|
||||
|
||||
document_element_id = unstructured_elements[0].metadata.parent_id
|
||||
|
||||
if document_element_id is None:
|
||||
document_element_id = OntologyElement.generate_unique_id()
|
||||
unstructured_elements[0].metadata.parent_id = document_element_id
|
||||
|
||||
id_to_element_mapping[document_element_id] = Document(
|
||||
additional_attributes={"id": document_element_id}
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user