Set <table> to be ontology.Table not UncategorizedText (#3782)

This commit is contained in:
Pluto 2024-11-15 15:30:48 +01:00 committed by GitHub
parent a6aefee0cb
commit ca27b8aa97
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 231 additions and 164 deletions

View File

@ -1,3 +1,12 @@
## 0.16.6-dev0
### Enhancements
- **Every `<table>` tag is considered to be `ontology.Table`** Added special handling for tables in HTML partitioning to improve the accuracy of table extraction from HTML documents.
### Features
### Fixes
## 0.16.5
### Enhancements

View File

@ -0,0 +1,53 @@
from collections import defaultdict
from typing import Dict, Type
from unstructured.documents import elements, ontology
from unstructured.documents.mappings import (
ALL_ONTOLOGY_ELEMENT_TYPES,
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
get_all_subclasses,
)
from unstructured.documents.ontology import OntologyElement
def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
    """Return the HTML tags that are listed by exactly one ontology element type.

    Each type in ALL_ONTOLOGY_ELEMENT_TYPES is instantiated and its ``allowed_tags`` are
    grouped per tag. A tag is kept only when its group holds a single entry; the value
    is that entry's type.
    """
    candidates_by_tag: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
    for ontology_type in ALL_ONTOLOGY_ELEMENT_TYPES:
        for html_tag in ontology_type().allowed_tags:
            candidates_by_tag[html_tag].append(ontology_type)

    exclusive_tags: dict[str, Type[OntologyElement]] = {}
    for html_tag, candidates in candidates_by_tag.items():
        # Skip tags with more than one candidate: they have no unambiguous owner.
        if len(candidates) != 1:
            continue
        exclusive_tags[html_tag] = candidates[0]
    return exclusive_tags
def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements():
    """Each tag listed by a single ontology type must default to exactly that type."""
    for html_tag, owning_type in _get_exclusive_html_tags().items():
        assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
        assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag] == owning_type
def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():
    """All default element types registered for HTML tags derive from OntologyElement."""
    default_types = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP.values()
    for default_type in default_types:
        assert issubclass(default_type, OntologyElement)
def test_ontology_to_unstructured_mapping_has_valid_types():
    """Mapping keys are ontology classes and mapping values are unstructured element classes."""
    for source_type, target_type in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE.items():
        assert issubclass(target_type, elements.Element)
        assert issubclass(source_type, ontology.OntologyElement)
def test_all_ontology_elements_are_defined_in_mapping_to_unstructured():
    """Every OntologyElement subclass needs an entry in the ontology-to-unstructured mapping."""
    for subclass in get_all_subclasses(ontology.OntologyElement):
        assert subclass in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE

View File

@ -1 +1 @@
__version__ = "0.16.5" # pragma: no cover
__version__ = "0.16.6-dev0" # pragma: no cover

View File

@ -5,10 +5,10 @@ They are used to simplify transformations between different representations
of parsed documents
"""
from collections import defaultdict
from typing import Any, Dict, Type
from unstructured.documents.ontology import OntologyElement
from unstructured.documents import elements, ontology
from unstructured.documents.elements import Element
def get_all_subclasses(cls) -> list[Any]:
@ -30,25 +30,9 @@ def get_all_subclasses(cls) -> list[Any]:
return all_subclasses
def get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
"""
Get a mapping of HTML tags to their exclusive OntologyElement types.
"""
html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES:
for tag in element_type().allowed_tags:
html_tag_to_element_type_mappings[tag].append(element_type)
return {
tag: element_types[0]
for tag, element_types in html_tag_to_element_type_mappings.items()
if len(element_types) == 1
}
def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
"""
Get a mapping of ontology element names to unstructured type names.
Get a mapping of ontology element to unstructured type.
The dictionary here was created based on the ontology mapping JSON.
Can be generated via the following code:
@ -63,97 +47,131 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
```
Returns:
dict: A dictionary where keys are ontology element class names
and values are unstructured type names.
dict: A dictionary where keys are ontology element classes
and values are unstructured types.
"""
ontology_to_unstructured_class_mapping = {
"Document": "UncategorizedText",
"Section": "UncategorizedText",
"Page": "UncategorizedText",
"Column": "UncategorizedText",
"Paragraph": "NarrativeText",
"Header": "Header",
"Footer": "Footer",
"Sidebar": "UncategorizedText",
"PageBreak": "PageBreak",
"Title": "Title",
"Subtitle": "Title",
"Heading": "Title",
"NarrativeText": "NarrativeText",
"Quote": "NarrativeText",
"Footnote": "UncategorizedText",
"Caption": "FigureCaption",
"PageNumber": "PageNumber",
"UncategorizedText": "UncategorizedText",
"OrderedList": "UncategorizedText",
"UnorderedList": "UncategorizedText",
"DefinitionList": "UncategorizedText",
"ListItem": "ListItem",
"Table": "Table",
"TableRow": "Table",
"TableCell": "Table",
"TableCellHeader": "Table",
"TableBody": "Table",
"TableHeader": "Table",
"Image": "Image",
"Figure": "Image",
"Video": "UncategorizedText",
"Audio": "UncategorizedText",
"Barcode": "Image",
"QRCode": "Image",
"Logo": "Image",
"CodeBlock": "CodeSnippet",
"InlineCode": "CodeSnippet",
"Formula": "Formula",
"Equation": "Formula",
"FootnoteReference": "UncategorizedText",
"Citation": "UncategorizedText",
"Bibliography": "UncategorizedText",
"Glossary": "UncategorizedText",
"Author": "UncategorizedText",
"MetaDate": "UncategorizedText",
"Keywords": "UncategorizedText",
"Abstract": "NarrativeText",
"Hyperlink": "UncategorizedText",
"TableOfContents": "UncategorizedText",
"Index": "UncategorizedText",
"Form": "UncategorizedText",
"FormField": "UncategorizedText",
"FormFieldValue": "UncategorizedText",
"Checkbox": "UncategorizedText",
"RadioButton": "UncategorizedText",
"Button": "UncategorizedText",
"Comment": "UncategorizedText",
"Highlight": "UncategorizedText",
"RevisionInsertion": "UncategorizedText",
"RevisionDeletion": "UncategorizedText",
"Address": "Address",
"EmailAddress": "EmailAddress",
"PhoneNumber": "UncategorizedText",
"CalendarDate": "UncategorizedText",
"Time": "UncategorizedText",
"Currency": "UncategorizedText",
"Measurement": "UncategorizedText",
"Letterhead": "Header",
"Signature": "UncategorizedText",
"Watermark": "UncategorizedText",
"Stamp": "UncategorizedText",
ontology.Document: elements.Text,
ontology.Section: elements.Text,
ontology.Page: elements.Text,
ontology.Column: elements.Text,
ontology.Paragraph: elements.NarrativeText,
ontology.Header: elements.Header,
ontology.Footer: elements.Footer,
ontology.Sidebar: elements.Text,
ontology.PageBreak: elements.PageBreak,
ontology.Title: elements.Title,
ontology.Subtitle: elements.Title,
ontology.Heading: elements.Title,
ontology.NarrativeText: elements.NarrativeText,
ontology.Quote: elements.NarrativeText,
ontology.Footnote: elements.Text,
ontology.Caption: elements.FigureCaption,
ontology.PageNumber: elements.PageNumber,
ontology.UncategorizedText: elements.Text,
ontology.OrderedList: elements.Text,
ontology.UnorderedList: elements.Text,
ontology.DefinitionList: elements.Text,
ontology.ListItem: elements.ListItem,
ontology.Table: elements.Table,
ontology.TableRow: elements.Table,
ontology.TableCell: elements.Table,
ontology.TableCellHeader: elements.Table,
ontology.TableBody: elements.Table,
ontology.TableHeader: elements.Table,
ontology.Image: elements.Image,
ontology.Figure: elements.Image,
ontology.Video: elements.Text,
ontology.Audio: elements.Text,
ontology.Barcode: elements.Image,
ontology.QRCode: elements.Image,
ontology.Logo: elements.Image,
ontology.CodeBlock: elements.CodeSnippet,
ontology.InlineCode: elements.CodeSnippet,
ontology.Formula: elements.Formula,
ontology.Equation: elements.Formula,
ontology.FootnoteReference: elements.Text,
ontology.Citation: elements.Text,
ontology.Bibliography: elements.Text,
ontology.Glossary: elements.Text,
ontology.Author: elements.Text,
ontology.MetaDate: elements.Text,
ontology.Keywords: elements.Text,
ontology.Abstract: elements.NarrativeText,
ontology.Hyperlink: elements.Text,
ontology.TableOfContents: elements.Text,
ontology.Index: elements.Text,
ontology.Form: elements.Text,
ontology.FormField: elements.Text,
ontology.FormFieldValue: elements.Text,
ontology.Checkbox: elements.Text,
ontology.RadioButton: elements.Text,
ontology.Button: elements.Text,
ontology.Comment: elements.Text,
ontology.Highlight: elements.Text,
ontology.RevisionInsertion: elements.Text,
ontology.RevisionDeletion: elements.Text,
ontology.Address: elements.Address,
ontology.EmailAddress: elements.EmailAddress,
ontology.PhoneNumber: elements.Text,
ontology.CalendarDate: elements.Text,
ontology.Time: elements.Text,
ontology.Currency: elements.Text,
ontology.Measurement: elements.Text,
ontology.Letterhead: elements.Header,
ontology.Signature: elements.Text,
ontology.Watermark: elements.Text,
ontology.Stamp: elements.Text,
}
return ontology_to_unstructured_class_mapping
ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(OntologyElement)
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[OntologyElement]] = {
ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(ontology.OntologyElement)
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[ontology.OntologyElement]] = {
(tag, element_type().css_class_name): element_type
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
for tag in element_type().allowed_tags
}
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = {
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
element_type().css_class_name: element_type
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
for tag in element_type().allowed_tags
}
EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = get_exclusive_html_tags()
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME = get_ontology_to_unstructured_type_mapping()
# Default ontology class for a bare HTML tag. Consulted by the HTML parser when the
# CSS class of a tag does not identify an ontology type but the tag name is listed
# here ("Scenario 3" in extract_tag_and_ontology_class_from_tag).
# The unit tests require every tag that only one ontology type allows to be present
# with that type as its value; the map may hold further tags, e.g. "table", which is
# mapped to ontology.Table.
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
    "body": ontology.Document,
    "footer": ontology.Footer,
    "aside": ontology.Sidebar,
    "hr": ontology.PageBreak,
    "h3": ontology.Heading,
    "h4": ontology.Heading,
    "h5": ontology.Heading,
    "h6": ontology.Heading,
    "blockquote": ontology.Quote,
    "figcaption": ontology.Caption,
    "ol": ontology.OrderedList,
    "li": ontology.ListItem,
    "tbody": ontology.TableBody,
    "thead": ontology.TableHeader,
    "tr": ontology.TableRow,
    "td": ontology.TableCell,
    "th": ontology.TableCellHeader,
    "figure": ontology.Figure,
    "video": ontology.Video,
    "audio": ontology.Audio,
    "pre": ontology.CodeBlock,
    "sub": ontology.FootnoteReference,
    "cite": ontology.Citation,
    "nav": ontology.Index,
    "form": ontology.Form,
    "label": ontology.FormField,
    "button": ontology.Button,
    "mark": ontology.Highlight,
    "ins": ontology.RevisionInsertion,
    "del": ontology.RevisionDeletion,
    "address": ontology.Address,
    "table": ontology.Table,
}
# Ontology class -> unstructured element class lookup, built once at import time.
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping()

View File

@ -7,45 +7,24 @@ from typing import Sequence, Type
from bs4 import BeautifulSoup, Tag
from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
Element,
ElementMetadata,
Text,
)
from unstructured.documents import elements, ontology
from unstructured.documents.mappings import (
CSS_CLASS_TO_ELEMENT_TYPE_MAP,
EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP,
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP,
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME,
)
from unstructured.documents.ontology import (
Bibliography,
Citation,
Document,
ElementTypeEnum,
Footnote,
FootnoteReference,
Glossary,
Hyperlink,
NarrativeText,
OntologyElement,
Page,
Paragraph,
Quote,
UncategorizedText,
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
)
RECURSION_LIMIT = 50
def ontology_to_unstructured_elements(
ontology_element: OntologyElement,
ontology_element: ontology.OntologyElement,
parent_id: str = None,
page_number: int = None,
depth: int = 0,
filename: str | None = None,
) -> list[Element]:
) -> list[elements.Element]:
"""
Converts an OntologyElement object to a list of unstructured Element objects.
@ -70,18 +49,18 @@ def ontology_to_unstructured_elements(
list[Element]: A list of unstructured Element objects.
"""
elements_to_return = []
if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
if page_number is None and isinstance(ontology_element, Page):
if page_number is None and isinstance(ontology_element, ontology.Page):
page_number = ontology_element.page_number
if not isinstance(ontology_element, Document):
if not isinstance(ontology_element, ontology.Document):
elements_to_return += [
Text(
elements.Text(
text="",
element_id=ontology_element.id,
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
metadata=elements.ElementMetadata(
parent_id=parent_id,
text_as_html=ontology_element.to_html(add_children=False),
page_number=page_number,
@ -96,7 +75,7 @@ def ontology_to_unstructured_elements(
child,
parent_id=ontology_element.id,
page_number=page_number,
depth=0 if isinstance(ontology_element, Document) else depth + 1,
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
filename=filename,
)
children += child
@ -104,10 +83,7 @@ def ontology_to_unstructured_elements(
combined_children = combine_inline_elements(children)
elements_to_return += combined_children
else:
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
ontology_element.__class__.__name__
]
element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
html_code_of_ontology_element = ontology_element.to_html()
element_text = ontology_element.to_text()
@ -115,7 +91,7 @@ def ontology_to_unstructured_elements(
text=element_text,
element_id=ontology_element.id,
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
metadata=elements.ElementMetadata(
parent_id=parent_id,
text_as_html=html_code_of_ontology_element,
page_number=page_number,
@ -128,7 +104,7 @@ def ontology_to_unstructured_elements(
return elements_to_return
def combine_inline_elements(elements: list[Element]) -> list[Element]:
def combine_inline_elements(elements: list[elements.Element]) -> list[elements.Element]:
"""
Combines consecutive inline elements into a single element. Inline elements
can be also combined with text elements.
@ -168,7 +144,9 @@ def combine_inline_elements(elements: list[Element]) -> list[Element]:
return result_elements
def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool:
def can_unstructured_elements_be_merged(
current_element: elements.Element, next_element: elements.Element
) -> bool:
"""
Elements can be merged when:
- They are on the same level in the HTML tree
@ -200,20 +178,20 @@ def can_unstructured_elements_be_merged(current_element: Element, next_element:
return True
def is_text_element(ontology_element: OntologyElement) -> bool:
def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
"""Categories or classes that we want to combine with inline text"""
text_classes = [
NarrativeText,
Quote,
Paragraph,
Footnote,
FootnoteReference,
Citation,
Bibliography,
Glossary,
ontology.NarrativeText,
ontology.Quote,
ontology.Paragraph,
ontology.Footnote,
ontology.FootnoteReference,
ontology.Citation,
ontology.Bibliography,
ontology.Glossary,
]
text_categories = [ElementTypeEnum.metadata]
text_categories = [ontology.ElementTypeEnum.metadata]
if any(isinstance(ontology_element, class_) for class_ in text_classes):
return True
@ -224,11 +202,14 @@ def is_text_element(ontology_element: OntologyElement) -> bool:
return False
def is_inline_element(ontology_element: OntologyElement) -> bool:
def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
"""Categories or classes that we want to combine with text elements"""
inline_classes = [Hyperlink]
inline_categories = [ElementTypeEnum.specialized_text, ElementTypeEnum.annotation]
inline_classes = [ontology.Hyperlink]
inline_categories = [
ontology.ElementTypeEnum.specialized_text,
ontology.ElementTypeEnum.annotation,
]
if any(isinstance(ontology_element, class_) for class_ in inline_classes):
return True
@ -239,7 +220,9 @@ def is_inline_element(ontology_element: OntologyElement) -> bool:
return False
def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) -> OntologyElement:
def unstructured_elements_to_ontology(
unstructured_elements: Sequence[elements.Element],
) -> ontology.OntologyElement:
"""
Converts a sequence of unstructured Element objects to an OntologyElement object.
@ -260,10 +243,10 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
document_element_id = unstructured_elements[0].metadata.parent_id
if document_element_id is None:
document_element_id = OntologyElement.generate_unique_id()
document_element_id = ontology.OntologyElement.generate_unique_id()
unstructured_elements[0].metadata.parent_id = document_element_id
id_to_element_mapping[document_element_id] = Document(
id_to_element_mapping[document_element_id] = ontology.Document(
additional_attributes={"id": document_element_id}
)
@ -288,7 +271,7 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
return root_element
def parse_html_to_ontology(html_code: str) -> OntologyElement:
def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
"""
Parses the given HTML code and converts it into an OntologyElement object.
@ -356,7 +339,9 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
return str(soup)
def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
def parse_html_to_ontology_element(
soup: Tag, recursion_depth: int = 1
) -> ontology.OntologyElement | None:
"""
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
@ -375,7 +360,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
escaped_attrs = get_escaped_attributes(soup)
if soup.name == "br": # Note(Pluto) should it be <br class="UncategorizedText">?
return Paragraph(
return ontology.Paragraph(
text="",
css_class_name=None,
html_tag_name="br",
@ -383,9 +368,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
)
has_children = (
(ontology_class != UncategorizedText)
(ontology_class != ontology.UncategorizedText)
and any(isinstance(content, Tag) for content in soup.contents)
or ontology_class().elementType == ElementTypeEnum.layout
or ontology_class().elementType == ontology.ElementTypeEnum.layout
)
should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT
@ -395,7 +380,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
(
parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
if isinstance(child, Tag)
else Paragraph(text=str(child).strip())
else ontology.Paragraph(text=str(child).strip())
)
for child in soup.children
if str(child).strip()
@ -414,7 +399,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol
return output_element
def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[OntologyElement]]:
def extract_tag_and_ontology_class_from_tag(
soup: Tag,
) -> tuple[str, Type[ontology.OntologyElement]]:
"""
Extracts the HTML tag and corresponding ontology class
from a BeautifulSoup Tag object. The CSS class is prioritized over
@ -445,8 +432,8 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo
html_tag = element_class().allowed_tags[0]
# Scenario 3: CSS class incorrect, but HTML tag correct and exclusive in ontology
if not element_class and soup.name in EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP:
html_tag, element_class = soup.name, EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP[soup.name]
if not element_class and soup.name in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP:
html_tag, element_class = soup.name, HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[soup.name]
# Scenario 4: CSS class incorrect, HTML tag incorrect
# Fallback to default UncategorizedText
@ -455,7 +442,7 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo
# e.g. parent=FormField soup.name=input -> element=FormFieldInput
html_tag = "span"
element_class = UncategorizedText
element_class = ontology.UncategorizedText
return html_tag, element_class