mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-30 08:43:44 +00:00
Set <table> to be ontology.Table not UncategorizedText (#3782)
This commit is contained in:
parent
a6aefee0cb
commit
ca27b8aa97
@ -1,3 +1,12 @@
|
||||
## 0.16.6-dev0
|
||||
|
||||
### Enhancements
|
||||
- **Every `<table>` tag is considered to be `ontology.Table`** Added special handling for tables in HTML partitioning. This change improves the accuracy of table extraction from HTML documents.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.16.5
|
||||
|
||||
### Enhancements
|
||||
|
||||
53
test_unstructured/documents/test_mappings.py
Normal file
53
test_unstructured/documents/test_mappings.py
Normal file
@ -0,0 +1,53 @@
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Type
|
||||
|
||||
from unstructured.documents import elements, ontology
|
||||
from unstructured.documents.mappings import (
|
||||
ALL_ONTOLOGY_ELEMENT_TYPES,
|
||||
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
|
||||
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
|
||||
get_all_subclasses,
|
||||
)
|
||||
from unstructured.documents.ontology import OntologyElement
|
||||
|
||||
|
||||
def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
    """Return the HTML tags that exactly one ontology element type allows.

    Each type in ``ALL_ONTOLOGY_ELEMENT_TYPES`` is instantiated without
    arguments and its ``allowed_tags`` are collected. A tag collected for a
    single type is returned mapped to that type; a tag collected more than
    once is left out of the result.
    """
    candidates_by_tag: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
    for ontology_type in ALL_ONTOLOGY_ELEMENT_TYPES:
        for html_tag in ontology_type().allowed_tags:
            candidates_by_tag[html_tag].append(ontology_type)

    exclusive_tags = {}
    for html_tag, candidates in candidates_by_tag.items():
        # Skip tags with more than one candidate: they are not exclusive.
        if len(candidates) != 1:
            continue
        (only_candidate,) = candidates
        exclusive_tags[html_tag] = only_candidate
    return exclusive_tags
|
||||
|
||||
|
||||
def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements():
    """Every exclusive tag must be in the default tag map, mapped to its sole type."""
    for html_tag, sole_type in _get_exclusive_html_tags().items():
        # Membership is asserted first so a missing tag fails as an assertion,
        # not as a KeyError raised by the lookup below.
        assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
        assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag] == sole_type
|
||||
|
||||
|
||||
def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():
    """Every value in the default tag map must be an OntologyElement subclass."""
    default_map = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
    for mapped_type in default_map.values():
        assert issubclass(mapped_type, OntologyElement)
|
||||
|
||||
|
||||
def test_ontology_to_unstructured_mapping_has_valid_types():
    """Keys must be ontology classes and values must be unstructured Element classes."""
    class_mapping = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE
    for ontology_cls, unstructured_cls in class_mapping.items():
        assert issubclass(unstructured_cls, elements.Element)
        assert issubclass(ontology_cls, ontology.OntologyElement)
|
||||
|
||||
|
||||
def test_all_ontology_elements_are_defined_in_mapping_to_unstructured():
    """Every OntologyElement subclass must have an entry in the ontology-to-unstructured map."""
    every_ontology_class = get_all_subclasses(ontology.OntologyElement)
    for ontology_cls in every_ontology_class:
        assert ontology_cls in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.5" # pragma: no cover
|
||||
__version__ = "0.16.6-dev0" # pragma: no cover
|
||||
|
||||
@ -5,10 +5,10 @@ They are used to simplify transformations between different representations
|
||||
of parsed documents
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, Type
|
||||
|
||||
from unstructured.documents.ontology import OntologyElement
|
||||
from unstructured.documents import elements, ontology
|
||||
from unstructured.documents.elements import Element
|
||||
|
||||
|
||||
def get_all_subclasses(cls) -> list[Any]:
|
||||
@ -30,25 +30,9 @@ def get_all_subclasses(cls) -> list[Any]:
|
||||
return all_subclasses
|
||||
|
||||
|
||||
def get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
|
||||
def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
|
||||
"""
|
||||
Get a mapping of HTML tags to their exclusive OntologyElement types.
|
||||
"""
|
||||
html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
|
||||
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES:
|
||||
for tag in element_type().allowed_tags:
|
||||
html_tag_to_element_type_mappings[tag].append(element_type)
|
||||
|
||||
return {
|
||||
tag: element_types[0]
|
||||
for tag, element_types in html_tag_to_element_type_mappings.items()
|
||||
if len(element_types) == 1
|
||||
}
|
||||
|
||||
|
||||
def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
|
||||
"""
|
||||
Get a mapping of ontology element names to unstructured type names.
|
||||
Get a mapping of ontology element to unstructured type.
|
||||
|
||||
The dictionary here was created based on the ontology mapping JSON.
|
||||
Can be generated via the following code:
|
||||
@ -63,97 +47,131 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
|
||||
```
|
||||
|
||||
Returns:
|
||||
dict: A dictionary where keys are ontology element class names
|
||||
and values are unstructured type names.
|
||||
dict: A dictionary where keys are ontology element classes
|
||||
and values are unstructured types.
|
||||
"""
|
||||
ontology_to_unstructured_class_mapping = {
|
||||
"Document": "UncategorizedText",
|
||||
"Section": "UncategorizedText",
|
||||
"Page": "UncategorizedText",
|
||||
"Column": "UncategorizedText",
|
||||
"Paragraph": "NarrativeText",
|
||||
"Header": "Header",
|
||||
"Footer": "Footer",
|
||||
"Sidebar": "UncategorizedText",
|
||||
"PageBreak": "PageBreak",
|
||||
"Title": "Title",
|
||||
"Subtitle": "Title",
|
||||
"Heading": "Title",
|
||||
"NarrativeText": "NarrativeText",
|
||||
"Quote": "NarrativeText",
|
||||
"Footnote": "UncategorizedText",
|
||||
"Caption": "FigureCaption",
|
||||
"PageNumber": "PageNumber",
|
||||
"UncategorizedText": "UncategorizedText",
|
||||
"OrderedList": "UncategorizedText",
|
||||
"UnorderedList": "UncategorizedText",
|
||||
"DefinitionList": "UncategorizedText",
|
||||
"ListItem": "ListItem",
|
||||
"Table": "Table",
|
||||
"TableRow": "Table",
|
||||
"TableCell": "Table",
|
||||
"TableCellHeader": "Table",
|
||||
"TableBody": "Table",
|
||||
"TableHeader": "Table",
|
||||
"Image": "Image",
|
||||
"Figure": "Image",
|
||||
"Video": "UncategorizedText",
|
||||
"Audio": "UncategorizedText",
|
||||
"Barcode": "Image",
|
||||
"QRCode": "Image",
|
||||
"Logo": "Image",
|
||||
"CodeBlock": "CodeSnippet",
|
||||
"InlineCode": "CodeSnippet",
|
||||
"Formula": "Formula",
|
||||
"Equation": "Formula",
|
||||
"FootnoteReference": "UncategorizedText",
|
||||
"Citation": "UncategorizedText",
|
||||
"Bibliography": "UncategorizedText",
|
||||
"Glossary": "UncategorizedText",
|
||||
"Author": "UncategorizedText",
|
||||
"MetaDate": "UncategorizedText",
|
||||
"Keywords": "UncategorizedText",
|
||||
"Abstract": "NarrativeText",
|
||||
"Hyperlink": "UncategorizedText",
|
||||
"TableOfContents": "UncategorizedText",
|
||||
"Index": "UncategorizedText",
|
||||
"Form": "UncategorizedText",
|
||||
"FormField": "UncategorizedText",
|
||||
"FormFieldValue": "UncategorizedText",
|
||||
"Checkbox": "UncategorizedText",
|
||||
"RadioButton": "UncategorizedText",
|
||||
"Button": "UncategorizedText",
|
||||
"Comment": "UncategorizedText",
|
||||
"Highlight": "UncategorizedText",
|
||||
"RevisionInsertion": "UncategorizedText",
|
||||
"RevisionDeletion": "UncategorizedText",
|
||||
"Address": "Address",
|
||||
"EmailAddress": "EmailAddress",
|
||||
"PhoneNumber": "UncategorizedText",
|
||||
"CalendarDate": "UncategorizedText",
|
||||
"Time": "UncategorizedText",
|
||||
"Currency": "UncategorizedText",
|
||||
"Measurement": "UncategorizedText",
|
||||
"Letterhead": "Header",
|
||||
"Signature": "UncategorizedText",
|
||||
"Watermark": "UncategorizedText",
|
||||
"Stamp": "UncategorizedText",
|
||||
ontology.Document: elements.Text,
|
||||
ontology.Section: elements.Text,
|
||||
ontology.Page: elements.Text,
|
||||
ontology.Column: elements.Text,
|
||||
ontology.Paragraph: elements.NarrativeText,
|
||||
ontology.Header: elements.Header,
|
||||
ontology.Footer: elements.Footer,
|
||||
ontology.Sidebar: elements.Text,
|
||||
ontology.PageBreak: elements.PageBreak,
|
||||
ontology.Title: elements.Title,
|
||||
ontology.Subtitle: elements.Title,
|
||||
ontology.Heading: elements.Title,
|
||||
ontology.NarrativeText: elements.NarrativeText,
|
||||
ontology.Quote: elements.NarrativeText,
|
||||
ontology.Footnote: elements.Text,
|
||||
ontology.Caption: elements.FigureCaption,
|
||||
ontology.PageNumber: elements.PageNumber,
|
||||
ontology.UncategorizedText: elements.Text,
|
||||
ontology.OrderedList: elements.Text,
|
||||
ontology.UnorderedList: elements.Text,
|
||||
ontology.DefinitionList: elements.Text,
|
||||
ontology.ListItem: elements.ListItem,
|
||||
ontology.Table: elements.Table,
|
||||
ontology.TableRow: elements.Table,
|
||||
ontology.TableCell: elements.Table,
|
||||
ontology.TableCellHeader: elements.Table,
|
||||
ontology.TableBody: elements.Table,
|
||||
ontology.TableHeader: elements.Table,
|
||||
ontology.Image: elements.Image,
|
||||
ontology.Figure: elements.Image,
|
||||
ontology.Video: elements.Text,
|
||||
ontology.Audio: elements.Text,
|
||||
ontology.Barcode: elements.Image,
|
||||
ontology.QRCode: elements.Image,
|
||||
ontology.Logo: elements.Image,
|
||||
ontology.CodeBlock: elements.CodeSnippet,
|
||||
ontology.InlineCode: elements.CodeSnippet,
|
||||
ontology.Formula: elements.Formula,
|
||||
ontology.Equation: elements.Formula,
|
||||
ontology.FootnoteReference: elements.Text,
|
||||
ontology.Citation: elements.Text,
|
||||
ontology.Bibliography: elements.Text,
|
||||
ontology.Glossary: elements.Text,
|
||||
ontology.Author: elements.Text,
|
||||
ontology.MetaDate: elements.Text,
|
||||
ontology.Keywords: elements.Text,
|
||||
ontology.Abstract: elements.NarrativeText,
|
||||
ontology.Hyperlink: elements.Text,
|
||||
ontology.TableOfContents: elements.Text,
|
||||
ontology.Index: elements.Text,
|
||||
ontology.Form: elements.Text,
|
||||
ontology.FormField: elements.Text,
|
||||
ontology.FormFieldValue: elements.Text,
|
||||
ontology.Checkbox: elements.Text,
|
||||
ontology.RadioButton: elements.Text,
|
||||
ontology.Button: elements.Text,
|
||||
ontology.Comment: elements.Text,
|
||||
ontology.Highlight: elements.Text,
|
||||
ontology.RevisionInsertion: elements.Text,
|
||||
ontology.RevisionDeletion: elements.Text,
|
||||
ontology.Address: elements.Address,
|
||||
ontology.EmailAddress: elements.EmailAddress,
|
||||
ontology.PhoneNumber: elements.Text,
|
||||
ontology.CalendarDate: elements.Text,
|
||||
ontology.Time: elements.Text,
|
||||
ontology.Currency: elements.Text,
|
||||
ontology.Measurement: elements.Text,
|
||||
ontology.Letterhead: elements.Header,
|
||||
ontology.Signature: elements.Text,
|
||||
ontology.Watermark: elements.Text,
|
||||
ontology.Stamp: elements.Text,
|
||||
}
|
||||
|
||||
return ontology_to_unstructured_class_mapping
|
||||
|
||||
|
||||
ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(OntologyElement)
|
||||
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[OntologyElement]] = {
|
||||
ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(ontology.OntologyElement)
|
||||
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[ontology.OntologyElement]] = {
|
||||
(tag, element_type().css_class_name): element_type
|
||||
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
|
||||
for tag in element_type().allowed_tags
|
||||
}
|
||||
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = {
|
||||
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
|
||||
element_type().css_class_name: element_type
|
||||
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
|
||||
for tag in element_type().allowed_tags
|
||||
}
|
||||
|
||||
EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = get_exclusive_html_tags()
|
||||
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME = get_ontology_to_unstructured_type_mapping()
|
||||
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
|
||||
"body": ontology.Document,
|
||||
"footer": ontology.Footer,
|
||||
"aside": ontology.Sidebar,
|
||||
"hr": ontology.PageBreak,
|
||||
"h3": ontology.Heading,
|
||||
"h4": ontology.Heading,
|
||||
"h5": ontology.Heading,
|
||||
"h6": ontology.Heading,
|
||||
"blockquote": ontology.Quote,
|
||||
"figcaption": ontology.Caption,
|
||||
"ol": ontology.OrderedList,
|
||||
"li": ontology.ListItem,
|
||||
"tbody": ontology.TableBody,
|
||||
"thead": ontology.TableHeader,
|
||||
"tr": ontology.TableRow,
|
||||
"td": ontology.TableCell,
|
||||
"th": ontology.TableCellHeader,
|
||||
"figure": ontology.Figure,
|
||||
"video": ontology.Video,
|
||||
"audio": ontology.Audio,
|
||||
"pre": ontology.CodeBlock,
|
||||
"sub": ontology.FootnoteReference,
|
||||
"cite": ontology.Citation,
|
||||
"nav": ontology.Index,
|
||||
"form": ontology.Form,
|
||||
"label": ontology.FormField,
|
||||
"button": ontology.Button,
|
||||
"mark": ontology.Highlight,
|
||||
"ins": ontology.RevisionInsertion,
|
||||
"del": ontology.RevisionDeletion,
|
||||
"address": ontology.Address,
|
||||
"table": ontology.Table,
|
||||
}
|
||||
|
||||
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping()
|
||||
|
||||
@ -7,45 +7,24 @@ from typing import Sequence, Type
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
TYPE_TO_TEXT_ELEMENT_MAP,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Text,
|
||||
)
|
||||
from unstructured.documents import elements, ontology
|
||||
from unstructured.documents.mappings import (
|
||||
CSS_CLASS_TO_ELEMENT_TYPE_MAP,
|
||||
EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP,
|
||||
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP,
|
||||
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME,
|
||||
)
|
||||
from unstructured.documents.ontology import (
|
||||
Bibliography,
|
||||
Citation,
|
||||
Document,
|
||||
ElementTypeEnum,
|
||||
Footnote,
|
||||
FootnoteReference,
|
||||
Glossary,
|
||||
Hyperlink,
|
||||
NarrativeText,
|
||||
OntologyElement,
|
||||
Page,
|
||||
Paragraph,
|
||||
Quote,
|
||||
UncategorizedText,
|
||||
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
|
||||
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
|
||||
)
|
||||
|
||||
RECURSION_LIMIT = 50
|
||||
|
||||
|
||||
def ontology_to_unstructured_elements(
|
||||
ontology_element: OntologyElement,
|
||||
ontology_element: ontology.OntologyElement,
|
||||
parent_id: str = None,
|
||||
page_number: int = None,
|
||||
depth: int = 0,
|
||||
filename: str | None = None,
|
||||
) -> list[Element]:
|
||||
) -> list[elements.Element]:
|
||||
"""
|
||||
Converts an OntologyElement object to a list of unstructured Element objects.
|
||||
|
||||
@ -70,18 +49,18 @@ def ontology_to_unstructured_elements(
|
||||
list[Element]: A list of unstructured Element objects.
|
||||
"""
|
||||
elements_to_return = []
|
||||
if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
|
||||
if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
|
||||
|
||||
if page_number is None and isinstance(ontology_element, Page):
|
||||
if page_number is None and isinstance(ontology_element, ontology.Page):
|
||||
page_number = ontology_element.page_number
|
||||
|
||||
if not isinstance(ontology_element, Document):
|
||||
if not isinstance(ontology_element, ontology.Document):
|
||||
elements_to_return += [
|
||||
Text(
|
||||
elements.Text(
|
||||
text="",
|
||||
element_id=ontology_element.id,
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
metadata=elements.ElementMetadata(
|
||||
parent_id=parent_id,
|
||||
text_as_html=ontology_element.to_html(add_children=False),
|
||||
page_number=page_number,
|
||||
@ -96,7 +75,7 @@ def ontology_to_unstructured_elements(
|
||||
child,
|
||||
parent_id=ontology_element.id,
|
||||
page_number=page_number,
|
||||
depth=0 if isinstance(ontology_element, Document) else depth + 1,
|
||||
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
|
||||
filename=filename,
|
||||
)
|
||||
children += child
|
||||
@ -104,10 +83,7 @@ def ontology_to_unstructured_elements(
|
||||
combined_children = combine_inline_elements(children)
|
||||
elements_to_return += combined_children
|
||||
else:
|
||||
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
|
||||
ontology_element.__class__.__name__
|
||||
]
|
||||
element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
|
||||
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
|
||||
html_code_of_ontology_element = ontology_element.to_html()
|
||||
element_text = ontology_element.to_text()
|
||||
|
||||
@ -115,7 +91,7 @@ def ontology_to_unstructured_elements(
|
||||
text=element_text,
|
||||
element_id=ontology_element.id,
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
metadata=elements.ElementMetadata(
|
||||
parent_id=parent_id,
|
||||
text_as_html=html_code_of_ontology_element,
|
||||
page_number=page_number,
|
||||
@ -128,7 +104,7 @@ def ontology_to_unstructured_elements(
|
||||
return elements_to_return
|
||||
|
||||
|
||||
def combine_inline_elements(elements: list[Element]) -> list[Element]:
|
||||
def combine_inline_elements(elements: list[elements.Element]) -> list[elements.Element]:
|
||||
"""
|
||||
Combines consecutive inline elements into a single element. Inline elements
|
||||
can be also combined with text elements.
|
||||
@ -168,7 +144,9 @@ def combine_inline_elements(elements: list[Element]) -> list[Element]:
|
||||
return result_elements
|
||||
|
||||
|
||||
def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool:
|
||||
def can_unstructured_elements_be_merged(
|
||||
current_element: elements.Element, next_element: elements.Element
|
||||
) -> bool:
|
||||
"""
|
||||
Elements can be merged when:
|
||||
- They are on the same level in the HTML tree
|
||||
@ -200,20 +178,20 @@ def can_unstructured_elements_be_merged(current_element: Element, next_element:
|
||||
return True
|
||||
|
||||
|
||||
def is_text_element(ontology_element: OntologyElement) -> bool:
|
||||
def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
|
||||
"""Categories or classes that we want to combine with inline text"""
|
||||
|
||||
text_classes = [
|
||||
NarrativeText,
|
||||
Quote,
|
||||
Paragraph,
|
||||
Footnote,
|
||||
FootnoteReference,
|
||||
Citation,
|
||||
Bibliography,
|
||||
Glossary,
|
||||
ontology.NarrativeText,
|
||||
ontology.Quote,
|
||||
ontology.Paragraph,
|
||||
ontology.Footnote,
|
||||
ontology.FootnoteReference,
|
||||
ontology.Citation,
|
||||
ontology.Bibliography,
|
||||
ontology.Glossary,
|
||||
]
|
||||
text_categories = [ElementTypeEnum.metadata]
|
||||
text_categories = [ontology.ElementTypeEnum.metadata]
|
||||
|
||||
if any(isinstance(ontology_element, class_) for class_ in text_classes):
|
||||
return True
|
||||
@ -224,11 +202,14 @@ def is_text_element(ontology_element: OntologyElement) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def is_inline_element(ontology_element: OntologyElement) -> bool:
|
||||
def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
|
||||
"""Categories or classes that we want to combine with text elements"""
|
||||
|
||||
inline_classes = [Hyperlink]
|
||||
inline_categories = [ElementTypeEnum.specialized_text, ElementTypeEnum.annotation]
|
||||
inline_classes = [ontology.Hyperlink]
|
||||
inline_categories = [
|
||||
ontology.ElementTypeEnum.specialized_text,
|
||||
ontology.ElementTypeEnum.annotation,
|
||||
]
|
||||
|
||||
if any(isinstance(ontology_element, class_) for class_ in inline_classes):
|
||||
return True
|
||||
@ -239,7 +220,9 @@ def is_inline_element(ontology_element: OntologyElement) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) -> OntologyElement:
|
||||
def unstructured_elements_to_ontology(
|
||||
unstructured_elements: Sequence[elements.Element],
|
||||
) -> ontology.OntologyElement:
|
||||
"""
|
||||
Converts a sequence of unstructured Element objects to an OntologyElement object.
|
||||
|
||||
@ -260,10 +243,10 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
|
||||
document_element_id = unstructured_elements[0].metadata.parent_id
|
||||
|
||||
if document_element_id is None:
|
||||
document_element_id = OntologyElement.generate_unique_id()
|
||||
document_element_id = ontology.OntologyElement.generate_unique_id()
|
||||
unstructured_elements[0].metadata.parent_id = document_element_id
|
||||
|
||||
id_to_element_mapping[document_element_id] = Document(
|
||||
id_to_element_mapping[document_element_id] = ontology.Document(
|
||||
additional_attributes={"id": document_element_id}
|
||||
)
|
||||
|
||||
@ -288,7 +271,7 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
|
||||
return root_element
|
||||
|
||||
|
||||
def parse_html_to_ontology(html_code: str) -> OntologyElement:
|
||||
def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
|
||||
"""
|
||||
Parses the given HTML code and converts it into an OntologyElement object.
|
||||
|
||||
@ -356,7 +339,9 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
|
||||
return str(soup)
|
||||
|
||||
|
||||
def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
|
||||
def parse_html_to_ontology_element(
|
||||
soup: Tag, recursion_depth: int = 1
|
||||
) -> ontology.OntologyElement | None:
|
||||
"""
|
||||
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
|
||||
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
|
||||
@ -375,7 +360,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
|
||||
escaped_attrs = get_escaped_attributes(soup)
|
||||
|
||||
if soup.name == "br": # Note(Pluto) should it be <br class="UncategorizedText">?
|
||||
return Paragraph(
|
||||
return ontology.Paragraph(
|
||||
text="",
|
||||
css_class_name=None,
|
||||
html_tag_name="br",
|
||||
@ -383,9 +368,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
|
||||
)
|
||||
|
||||
has_children = (
|
||||
(ontology_class != UncategorizedText)
|
||||
(ontology_class != ontology.UncategorizedText)
|
||||
and any(isinstance(content, Tag) for content in soup.contents)
|
||||
or ontology_class().elementType == ElementTypeEnum.layout
|
||||
or ontology_class().elementType == ontology.ElementTypeEnum.layout
|
||||
)
|
||||
should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT
|
||||
|
||||
@ -395,7 +380,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
|
||||
(
|
||||
parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
|
||||
if isinstance(child, Tag)
|
||||
else Paragraph(text=str(child).strip())
|
||||
else ontology.Paragraph(text=str(child).strip())
|
||||
)
|
||||
for child in soup.children
|
||||
if str(child).strip()
|
||||
@ -414,7 +399,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
|
||||
return output_element
|
||||
|
||||
|
||||
def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[OntologyElement]]:
|
||||
def extract_tag_and_ontology_class_from_tag(
|
||||
soup: Tag,
|
||||
) -> tuple[str, Type[ontology.OntologyElement]]:
|
||||
"""
|
||||
Extracts the HTML tag and corresponding ontology class
|
||||
from a BeautifulSoup Tag object. The CSS class is prioritized over
|
||||
@ -445,8 +432,8 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo
|
||||
html_tag = element_class().allowed_tags[0]
|
||||
|
||||
# Scenario 3: CSS class incorrect, but HTML tag correct and exclusive in ontology
|
||||
if not element_class and soup.name in EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP:
|
||||
html_tag, element_class = soup.name, EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP[soup.name]
|
||||
if not element_class and soup.name in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP:
|
||||
html_tag, element_class = soup.name, HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[soup.name]
|
||||
|
||||
# Scenario 4: CSS class incorrect, HTML tag incorrect
|
||||
# Fallback to default UncategorizedText
|
||||
@ -455,7 +442,7 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo
|
||||
# e.g. parent=FormField soup.name=input -> element=FormFieldInput
|
||||
|
||||
html_tag = "span"
|
||||
element_class = UncategorizedText
|
||||
element_class = ontology.UncategorizedText
|
||||
|
||||
return html_tag, element_class
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user