diff --git a/CHANGELOG.md b/CHANGELOG.md index 2369344cb..aa63a03da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### Features +* **Add a set of new `ElementType`s to support future element types**. Adds the `Paragraph`, `CheckBoxChecked`, `CheckBoxUnchecked`, `RadioButtonChecked`, `RadioButtonUnchecked`, `PageNumber` and `CodeSnippet` types, along with new `CodeSnippet` and `PageNumber` element classes. + ### Fixes * **Fix `partition_html()` swallowing some paragraphs**. The `partition_html()` only considers elements with limited depth to avoid becoming the text representation of a giant div. This fix increases the limit value. diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 61de0f54d..fdd70466d 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -18,6 +18,7 @@ from unstructured.documents.coordinates import ( ) from unstructured.documents.elements import ( UUID, + CheckBox, ConsolidationStrategy, CoordinatesMetadata, DataSourceMetadata, @@ -72,6 +73,14 @@ def test_text_element_apply_multiple_cleaners(): assert str(text_element) == "A Textbook on Crocodile Habitats" +def test_non_text_elements_are_serializable_to_text(): + element = CheckBox() + assert hasattr(element, "text") + assert element.text is not None + assert element.text == "" + assert str(element) == "" + + def test_apply_raises_if_func_does_not_produce_string(): def bad_cleaner(s: str): return 1 diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index 57f672584..31e831877 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -14,6 +14,7 @@ from unstructured_inference.inference.layoutelement import LayoutElement from unstructured.documents.coordinates import PixelSpace from unstructured.documents.elements import ( + TYPE_TO_TEXT_ELEMENT_MAP, CheckBox, CoordinatesMetadata, ElementMetadata, @@ -207,30 +208,48 @@ def test_normalize_layout_element_layout_element_narrative_text(): ) -def test_normalize_layout_element_checked_box(): +@pytest.mark.parametrize( + ("element_type", 
"expected_element_class"), + TYPE_TO_TEXT_ELEMENT_MAP.items(), +) +def test_normalize_layout_element_layout_element_maps_to_appropriate_text_element( + element_type: str, + expected_element_class: type[Text], +): layout_element = LayoutElement.from_coords( - type="Checked", + type=element_type, x1=1, y1=2, x2=3, y2=4, - text="", + text="Some lovely text", ) coordinate_system = PixelSpace(width=10, height=20) element = common.normalize_layout_element( layout_element, coordinate_system=coordinate_system, ) - assert element == CheckBox( - checked=True, + assert element == expected_element_class( + text="Some lovely text", coordinates=((1, 2), (1, 4), (3, 4), (3, 2)), coordinate_system=coordinate_system, ) -def test_normalize_layout_element_unchecked_box(): +@pytest.mark.parametrize( + ("element_type", "expected_checked"), + [ + (ElementType.CHECK_BOX_UNCHECKED, False), + (ElementType.CHECK_BOX_CHECKED, True), + (ElementType.RADIO_BUTTON_UNCHECKED, False), + (ElementType.RADIO_BUTTON_CHECKED, True), + (ElementType.CHECKED, True), + (ElementType.UNCHECKED, False), + ], +) +def test_normalize_layout_element_checkable(element_type: str, expected_checked: bool): layout_element = LayoutElement.from_coords( - type="Unchecked", + type=element_type, x1=1, y1=2, x2=3, @@ -242,8 +261,9 @@ def test_normalize_layout_element_unchecked_box(): layout_element, coordinate_system=coordinate_system, ) + assert isinstance(element, CheckBox) assert element == CheckBox( - checked=False, + checked=expected_checked, coordinates=((1, 2), (1, 4), (3, 4), (3, 2)), coordinate_system=coordinate_system, ) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 002e7f91c..c3f33eeff 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -597,6 +597,7 @@ class ElementType: UNCATEGORIZED_TEXT = "UncategorizedText" NARRATIVE_TEXT = "NarrativeText" BULLETED_TEXT = "BulletedText" + PARAGRAPH = "Paragraph" ABSTRACT = "Abstract" 
THREADING = "Threading" FORM = "Form" @@ -614,6 +615,10 @@ class ElementType: LIST_ITEM_OTHER = "List-item" CHECKED = "Checked" UNCHECKED = "Unchecked" + CHECK_BOX_CHECKED = "CheckBoxChecked" + CHECK_BOX_UNCHECKED = "CheckBoxUnchecked" + RADIO_BUTTON_CHECKED = "RadioButtonChecked" + RADIO_BUTTON_UNCHECKED = "RadioButtonUnchecked" ADDRESS = "Address" EMAIL_ADDRESS = "EmailAddress" PAGE_BREAK = "PageBreak" @@ -627,6 +632,8 @@ class ElementType: FOOTER = "Footer" FOOTNOTE = "Footnote" PAGE_FOOTER = "Page-footer" + PAGE_NUMBER = "PageNumber" + CODE_SNIPPET = "CodeSnippet" @classmethod def to_dict(cls): @@ -707,6 +714,9 @@ class Element(abc.ABC): return new_coordinates + def __str__(self): + return self.text + class CheckBox(Element): """A checkbox with an attribute indicating whether its checked or not. @@ -798,9 +808,6 @@ class Text(Element): ), ) - def __str__(self): - return self.text - def to_dict(self) -> dict[str, Any]: """Serialize to JSON-compatible (str keys) dict.""" out = super().to_dict() @@ -912,6 +919,18 @@ class Footer(Text): category = "Footer" +class CodeSnippet(Text): + """An element for capturing code snippets.""" + + category = "CodeSnippet" + + +class PageNumber(Text): + """An element for capturing page numbers.""" + + category = "PageNumber" + + TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = { ElementType.TITLE: Title, ElementType.SECTION_HEADER: Title, @@ -922,6 +941,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = { ElementType.COMPOSITE_ELEMENT: Text, ElementType.TEXT: NarrativeText, ElementType.NARRATIVE_TEXT: NarrativeText, + ElementType.PARAGRAPH: NarrativeText, # this mapping favors ensures yolox produces backward compatible categories ElementType.ABSTRACT: NarrativeText, ElementType.THREADING: NarrativeText, @@ -946,4 +966,6 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = { ElementType.EMAIL_ADDRESS: EmailAddress, ElementType.FORMULA: Formula, ElementType.PAGE_BREAK: PageBreak, + ElementType.CODE_SNIPPET: CodeSnippet, + 
ElementType.PAGE_NUMBER: PageNumber, } diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 35321b30f..944c2ecda 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -39,7 +39,6 @@ if TYPE_CHECKING: from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import LayoutElement - HIERARCHY_RULE_SET = { "Title": [ "Text", @@ -132,22 +131,22 @@ def normalize_layout_element( class_prob_metadata = ElementMetadata(detection_class_prob=float(prob)) # type: ignore else: class_prob_metadata = ElementMetadata() + common_kwargs = { + "coordinates": coordinates, + "coordinate_system": coordinate_system, + "metadata": class_prob_metadata, + "detection_origin": origin, + } if element_type == ElementType.LIST: if infer_list_items: return layout_list_to_list_items( text, - coordinates=coordinates, - coordinate_system=coordinate_system, - metadata=class_prob_metadata, - detection_origin=origin, + **common_kwargs, ) else: return ListItem( text=text, - coordinates=coordinates, - coordinate_system=coordinate_system, - metadata=class_prob_metadata, - detection_origin=origin, + **common_kwargs, ) elif element_type in TYPE_TO_TEXT_ELEMENT_MAP: @@ -155,39 +154,34 @@ def normalize_layout_element( _element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type] _element_class = _element_class( text=text, - coordinates=coordinates, - coordinate_system=coordinate_system, - metadata=class_prob_metadata, - detection_origin=origin, + **common_kwargs, ) if element_type == ElementType.HEADLINE: _element_class.metadata.category_depth = 1 elif element_type == ElementType.SUB_HEADLINE: _element_class.metadata.category_depth = 2 return _element_class - elif element_type == ElementType.CHECKED: + elif element_type in [ + ElementType.CHECK_BOX_CHECKED, + ElementType.CHECK_BOX_UNCHECKED, + ElementType.RADIO_BUTTON_CHECKED, + ElementType.RADIO_BUTTON_UNCHECKED, + 
ElementType.CHECKED, + ElementType.UNCHECKED, + ]: + checked = element_type in [ + ElementType.CHECK_BOX_CHECKED, + ElementType.RADIO_BUTTON_CHECKED, + ElementType.CHECKED, + ] return CheckBox( - checked=True, - coordinates=coordinates, - coordinate_system=coordinate_system, - metadata=class_prob_metadata, - detection_origin=origin, - ) - elif element_type == ElementType.UNCHECKED: - return CheckBox( - checked=False, - coordinates=coordinates, - coordinate_system=coordinate_system, - metadata=class_prob_metadata, - detection_origin=origin, + checked=checked, + **common_kwargs, ) else: return Text( text=text, - coordinates=coordinates, - coordinate_system=coordinate_system, - metadata=class_prob_metadata, - detection_origin=origin, + **common_kwargs, )