mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
feat: element types extension (#2700)
This PR adds some new element types that can be used especially by pdf/image partition.
This commit is contained in:
parent
1ce60f2bba
commit
63fc2a1061
@ -4,6 +4,8 @@
|
||||
|
||||
### Features
|
||||
|
||||
* **Add a set of new `ElementType`s to extend future element types**
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fix `partition_html()` swallowing some paragraphs**. The `partition_html()` only considers elements with limited depth to avoid becoming the text representation of a giant div. This fix increases the limit value.
|
||||
|
@ -18,6 +18,7 @@ from unstructured.documents.coordinates import (
|
||||
)
|
||||
from unstructured.documents.elements import (
|
||||
UUID,
|
||||
CheckBox,
|
||||
ConsolidationStrategy,
|
||||
CoordinatesMetadata,
|
||||
DataSourceMetadata,
|
||||
@ -72,6 +73,14 @@ def test_text_element_apply_multiple_cleaners():
|
||||
assert str(text_element) == "A Textbook on Crocodile Habitats"
|
||||
|
||||
|
||||
def test_non_text_elements_are_serializable_to_text():
|
||||
element = CheckBox()
|
||||
assert hasattr(element, "text")
|
||||
assert element.text is not None
|
||||
assert element.text == ""
|
||||
assert str(element) == ""
|
||||
|
||||
|
||||
def test_apply_raises_if_func_does_not_produce_string():
|
||||
def bad_cleaner(s: str):
|
||||
return 1
|
||||
|
@ -14,6 +14,7 @@ from unstructured_inference.inference.layoutelement import LayoutElement
|
||||
|
||||
from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import (
|
||||
TYPE_TO_TEXT_ELEMENT_MAP,
|
||||
CheckBox,
|
||||
CoordinatesMetadata,
|
||||
ElementMetadata,
|
||||
@ -207,30 +208,48 @@ def test_normalize_layout_element_layout_element_narrative_text():
|
||||
)
|
||||
|
||||
|
||||
def test_normalize_layout_element_checked_box():
|
||||
@pytest.mark.parametrize(
|
||||
("element_type", "expected_element_class"),
|
||||
TYPE_TO_TEXT_ELEMENT_MAP.items(),
|
||||
)
|
||||
def test_normalize_layout_element_layout_element_maps_to_appropriate_text_element(
|
||||
element_type: str,
|
||||
expected_element_class: type[Text],
|
||||
):
|
||||
layout_element = LayoutElement.from_coords(
|
||||
type="Checked",
|
||||
type=element_type,
|
||||
x1=1,
|
||||
y1=2,
|
||||
x2=3,
|
||||
y2=4,
|
||||
text="",
|
||||
text="Some lovely text",
|
||||
)
|
||||
coordinate_system = PixelSpace(width=10, height=20)
|
||||
element = common.normalize_layout_element(
|
||||
layout_element,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
assert element == CheckBox(
|
||||
checked=True,
|
||||
assert element == expected_element_class(
|
||||
text="Some lovely text",
|
||||
coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
|
||||
|
||||
def test_normalize_layout_element_unchecked_box():
|
||||
@pytest.mark.parametrize(
|
||||
("element_type", "expected_checked"),
|
||||
[
|
||||
(ElementType.CHECK_BOX_UNCHECKED, False),
|
||||
(ElementType.CHECK_BOX_CHECKED, True),
|
||||
(ElementType.RADIO_BUTTON_UNCHECKED, False),
|
||||
(ElementType.RADIO_BUTTON_CHECKED, True),
|
||||
(ElementType.CHECKED, True),
|
||||
(ElementType.UNCHECKED, False),
|
||||
],
|
||||
)
|
||||
def test_normalize_layout_element_checkable(element_type: str, expected_checked: bool):
|
||||
layout_element = LayoutElement.from_coords(
|
||||
type="Unchecked",
|
||||
type=element_type,
|
||||
x1=1,
|
||||
y1=2,
|
||||
x2=3,
|
||||
@ -242,8 +261,9 @@ def test_normalize_layout_element_unchecked_box():
|
||||
layout_element,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
assert isinstance(element, CheckBox)
|
||||
assert element == CheckBox(
|
||||
checked=False,
|
||||
checked=expected_checked,
|
||||
coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
|
@ -597,6 +597,7 @@ class ElementType:
|
||||
UNCATEGORIZED_TEXT = "UncategorizedText"
|
||||
NARRATIVE_TEXT = "NarrativeText"
|
||||
BULLETED_TEXT = "BulletedText"
|
||||
PARAGRAPH = "Paragraph"
|
||||
ABSTRACT = "Abstract"
|
||||
THREADING = "Threading"
|
||||
FORM = "Form"
|
||||
@ -614,6 +615,10 @@ class ElementType:
|
||||
LIST_ITEM_OTHER = "List-item"
|
||||
CHECKED = "Checked"
|
||||
UNCHECKED = "Unchecked"
|
||||
CHECK_BOX_CHECKED = "CheckBoxChecked"
|
||||
CHECK_BOX_UNCHECKED = "CheckBoxUnchecked"
|
||||
RADIO_BUTTON_CHECKED = "RadioButtonChecked"
|
||||
RADIO_BUTTON_UNCHECKED = "RadioButtonUnchecked"
|
||||
ADDRESS = "Address"
|
||||
EMAIL_ADDRESS = "EmailAddress"
|
||||
PAGE_BREAK = "PageBreak"
|
||||
@ -627,6 +632,8 @@ class ElementType:
|
||||
FOOTER = "Footer"
|
||||
FOOTNOTE = "Footnote"
|
||||
PAGE_FOOTER = "Page-footer"
|
||||
PAGE_NUMBER = "PageNumber"
|
||||
CODE_SNIPPET = "CodeSnippet"
|
||||
|
||||
@classmethod
|
||||
def to_dict(cls):
|
||||
@ -707,6 +714,9 @@ class Element(abc.ABC):
|
||||
|
||||
return new_coordinates
|
||||
|
||||
def __str__(self):
|
||||
return self.text
|
||||
|
||||
|
||||
class CheckBox(Element):
|
||||
"""A checkbox with an attribute indicating whether its checked or not.
|
||||
@ -798,9 +808,6 @@ class Text(Element):
|
||||
),
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
return self.text
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Serialize to JSON-compatible (str keys) dict."""
|
||||
out = super().to_dict()
|
||||
@ -912,6 +919,18 @@ class Footer(Text):
|
||||
category = "Footer"
|
||||
|
||||
|
||||
class CodeSnippet(Text):
|
||||
"""An element for capturing code snippets."""
|
||||
|
||||
category = "CodeSnippet"
|
||||
|
||||
|
||||
class PageNumber(Text):
|
||||
"""An element for capturing page numbers."""
|
||||
|
||||
category = "PageNumber"
|
||||
|
||||
|
||||
TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
|
||||
ElementType.TITLE: Title,
|
||||
ElementType.SECTION_HEADER: Title,
|
||||
@ -922,6 +941,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
|
||||
ElementType.COMPOSITE_ELEMENT: Text,
|
||||
ElementType.TEXT: NarrativeText,
|
||||
ElementType.NARRATIVE_TEXT: NarrativeText,
|
||||
ElementType.PARAGRAPH: NarrativeText,
|
||||
# this mapping ensures yolox produces backward-compatible categories
|
||||
ElementType.ABSTRACT: NarrativeText,
|
||||
ElementType.THREADING: NarrativeText,
|
||||
@ -946,4 +966,6 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
|
||||
ElementType.EMAIL_ADDRESS: EmailAddress,
|
||||
ElementType.FORMULA: Formula,
|
||||
ElementType.PAGE_BREAK: PageBreak,
|
||||
ElementType.CODE_SNIPPET: CodeSnippet,
|
||||
ElementType.PAGE_NUMBER: PageNumber,
|
||||
}
|
||||
|
@ -39,7 +39,6 @@ if TYPE_CHECKING:
|
||||
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
|
||||
from unstructured_inference.inference.layoutelement import LayoutElement
|
||||
|
||||
|
||||
HIERARCHY_RULE_SET = {
|
||||
"Title": [
|
||||
"Text",
|
||||
@ -132,22 +131,22 @@ def normalize_layout_element(
|
||||
class_prob_metadata = ElementMetadata(detection_class_prob=float(prob)) # type: ignore
|
||||
else:
|
||||
class_prob_metadata = ElementMetadata()
|
||||
common_kwargs = {
|
||||
"coordinates": coordinates,
|
||||
"coordinate_system": coordinate_system,
|
||||
"metadata": class_prob_metadata,
|
||||
"detection_origin": origin,
|
||||
}
|
||||
if element_type == ElementType.LIST:
|
||||
if infer_list_items:
|
||||
return layout_list_to_list_items(
|
||||
text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
**common_kwargs,
|
||||
)
|
||||
else:
|
||||
return ListItem(
|
||||
text=text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
**common_kwargs,
|
||||
)
|
||||
|
||||
elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
|
||||
@ -155,39 +154,34 @@ def normalize_layout_element(
|
||||
_element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
|
||||
_element_class = _element_class(
|
||||
text=text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
**common_kwargs,
|
||||
)
|
||||
if element_type == ElementType.HEADLINE:
|
||||
_element_class.metadata.category_depth = 1
|
||||
elif element_type == ElementType.SUB_HEADLINE:
|
||||
_element_class.metadata.category_depth = 2
|
||||
return _element_class
|
||||
elif element_type == ElementType.CHECKED:
|
||||
elif element_type in [
|
||||
ElementType.CHECK_BOX_CHECKED,
|
||||
ElementType.CHECK_BOX_UNCHECKED,
|
||||
ElementType.RADIO_BUTTON_CHECKED,
|
||||
ElementType.RADIO_BUTTON_UNCHECKED,
|
||||
ElementType.CHECKED,
|
||||
ElementType.UNCHECKED,
|
||||
]:
|
||||
checked = element_type in [
|
||||
ElementType.CHECK_BOX_CHECKED,
|
||||
ElementType.RADIO_BUTTON_CHECKED,
|
||||
ElementType.CHECKED,
|
||||
]
|
||||
return CheckBox(
|
||||
checked=True,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
)
|
||||
elif element_type == ElementType.UNCHECKED:
|
||||
return CheckBox(
|
||||
checked=False,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
checked=checked,
|
||||
**common_kwargs,
|
||||
)
|
||||
else:
|
||||
return Text(
|
||||
text=text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
**common_kwargs,
|
||||
)
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user