Refactor: element type (#2035)

### Summary

- Add an `ElementType` class holding constants for the element type names.
- Replace the string keys of the `TYPE_TO_TEXT_ELEMENT_MAP` dictionary with the `ElementType` constants.
- Replace hard-coded element type strings with the constants.

### Testing

CI should pass.
2025-12-26 14:45:31 +00:00 · 2023-11-08 21:52:55 -08:00 · 2023-11-08 21:52:55 -08:00 · bb58c1bb0b
commit bb58c1bb0b
parent c688216b38
11 changed files with 111 additions and 55 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -18,7 +18,7 @@

 ### Enhancements

-* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
+* **Adds include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
 * **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
 * **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
 * **Add support for generic partition configs in ingest cli** Along with the explicit partition options supported by the cli, an `additional_partition_args` arg was added to allow users to pass in any other arguments that should be added when calling partition(). This helps keep any changes to the input parameters of the partition() exposed in the CLI.
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@ -15,6 +15,7 @@ from unstructured.documents.elements import (
    Address,
    CompositeElement,
    Element,
+    ElementType,
    Footer,
    Header,
    ListItem,
@ -120,7 +121,11 @@ def test_parition_docx_from_team_chat():
        "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
        "saved-by  Dennis Forsythe",
    ]
-    assert [e.category for e in elements] == ["UncategorizedText", "UncategorizedText", "Table"]
+    assert [e.category for e in elements] == [
+        ElementType.UNCATEGORIZED_TEXT,
+        ElementType.UNCATEGORIZED_TEXT,
+        ElementType.TABLE,
+    ]


 def test_partition_docx_from_filename(
--- a/test_unstructured/partition/markdown/test_md.py
+++ b/test_unstructured/partition/markdown/test_md.py
@ -7,7 +7,7 @@ import requests

 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
-from unstructured.documents.elements import Title
+from unstructured.documents.elements import ElementType, Title
 from unstructured.partition.md import partition_md
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA

@ -295,4 +295,4 @@ def test_partition_md_parse_table():
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "simple-table.md")
    elements = partition_md(filename=filename)
    assert len(elements) > 0
-    assert elements[0].category == "Table"
+    assert elements[0].category == ElementType.TABLE
--- a/test_unstructured/partition/pdf_image/test_image.py
+++ b/test_unstructured/partition/pdf_image/test_image.py
@ -9,6 +9,7 @@ from unstructured_inference.inference import layout

 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
+from unstructured.documents.elements import ElementType
 from unstructured.partition import image, ocr, pdf
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.utils import only
@ -126,7 +127,9 @@ def test_partition_image_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.jpg",
 ):
    elements = image.partition_image(filename=filename, strategy="auto")
-    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
+    titles = [
+        el for el in elements if el.category == ElementType.TITLE and len(el.text.split(" ")) > 10
+    ]
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    idx = 3
    assert titles[0].text == title
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@ -10,6 +10,7 @@ from unstructured_inference.inference.layoutelement import (
    LayoutElement,
 )

+from unstructured.documents.elements import ElementType
 from unstructured.partition import ocr
 from unstructured.partition.ocr import pad_element_bboxes
 from unstructured.partition.utils.constants import (
@ -248,7 +249,7 @@ def test_get_elements_from_ocr_regions(mock_embedded_text_regions):
            x2=1256.334784222222,
            y2=406.9837855555556,
            text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
-            type="UncategorizedText",
+            type=ElementType.UNCATEGORIZED_TEXT,
        ),
    ]

@ -271,7 +272,7 @@ def test_zoom_image(zoom):
@pytest.fixture()
 def mock_layout(mock_embedded_text_regions):
    return [
-        LayoutElement(text=r.text, type="UncategorizedText", bbox=r.bbox)
+        LayoutElement(text=r.text, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox)
        for r in mock_embedded_text_regions
    ]

@ -354,7 +355,7 @@ def mock_embedded_text_regions():

 def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
    ocr_elements = [
-        LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox)
+        LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox)
        for r in mock_ocr_regions
    ]

@ -379,7 +380,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):

 def test_merge_out_layout_with_ocr_layout(mock_out_layout, mock_ocr_regions):
    ocr_elements = [
-        LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox)
+        LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox)
        for r in mock_ocr_regions
    ]

@ -410,7 +411,7 @@ def test_pad_element_bboxes(padding, expected_bbox):
        y2=40,
        text="",
        source=None,
-        type="UncategorizedText",
+        type=ElementType.UNCATEGORIZED_TEXT,
    )
    expected_original_element_bbox = (10, 20, 30, 40)

--- a/test_unstructured/partition/test_common.py
+++ b/test_unstructured/partition/test_common.py
@ -12,6 +12,7 @@ from unstructured.documents.elements import (
    CheckBox,
    CoordinatesMetadata,
    ElementMetadata,
+    ElementType,
    FigureCaption,
    Header,
    ListItem,
@ -515,7 +516,7 @@ def test_ocr_data_to_elements(
            bbox=r.bbox,
            text=r.text,
            source=r.source,
-            type="UncategorizedText",
+            type=ElementType.UNCATEGORIZED_TEXT,
        )
        for r in text_regions
    ]
@ -527,7 +528,7 @@ def test_ocr_data_to_elements(
    )

    assert len(ocr_data) == len(elements)
-    assert {el.category for el in elements} == {"UncategorizedText"}
+    assert {el.category for el in elements} == {ElementType.UNCATEGORIZED_TEXT}

    # check coordinates metadata
    image_width, image_height = image.size
--- a/test_unstructured/staging/test_base_staging.py
+++ b/test_unstructured/staging/test_base_staging.py
@ -15,6 +15,7 @@ from unstructured.documents.elements import (
    CoordinateSystem,
    DataSourceMetadata,
    ElementMetadata,
+    ElementType,
    FigureCaption,
    Image,
    Link,
@ -40,7 +41,7 @@ def test_convert_to_isd():
    isd = base.convert_to_isd(elements)

    assert isd[0]["text"] == "Title 1"
-    assert isd[0]["type"] == "Title"
+    assert isd[0]["type"] == ElementType.TITLE

    assert isd[1]["text"] == "Narrative 1"
    assert isd[1]["type"] == "NarrativeText"
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -676,38 +676,76 @@ class Footer(Text):
    category = "Footer"


+class ElementType:
+    TITLE = "Title"
+    TEXT = "Text"
+    UNCATEGORIZED_TEXT = "UncategorizedText"
+    NARRATIVE_TEXT = "NarrativeText"
+    BULLETED_TEXT = "BulletedText"
+    ABSTRACT = "Abstract"
+    THREADING = "Threading"
+    FORM = "Form"
+    FIELD_NAME = "Field-Name"
+    VALUE = "Value"
+    LINK = "Link"
+    COMPOSITE_ELEMENT = "CompositeElement"
+    IMAGE = "Image"
+    PICTURE = "Picture"
+    FIGURE_CAPTION = "FigureCaption"
+    FIGURE = "Figure"
+    CAPTION = "Caption"
+    LIST = "List"
+    LIST_ITEM = "ListItem"
+    LIST_ITEM_OTHER = "List-item"
+    CHECKED = "Checked"
+    UNCHECKED = "Unchecked"
+    ADDRESS = "Address"
+    EMAIL_ADDRESS = "EmailAddress"
+    PAGE_BREAK = "PageBreak"
+    FORMULA = "Formula"
+    TABLE = "Table"
+    HEADER = "Header"
+    HEADLINE = "Headline"
+    SUB_HEADLINE = "Subheadline"
+    PAGE_HEADER = "Page-header"  # Title?
+    SECTION_HEADER = "Section-header"
+    FOOTER = "Footer"
+    FOOTNOTE = "Footnote"
+    PAGE_FOOTER = "Page-footer"
+
+
 TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
-    "UncategorizedText": Text,
-    "FigureCaption": FigureCaption,
-    "Figure": FigureCaption,
-    "Text": NarrativeText,
-    "NarrativeText": NarrativeText,
-    "ListItem": ListItem,
-    "BulletedText": ListItem,
-    "Title": Title,
-    "Address": Address,
-    "EmailAddress": EmailAddress,
-    "Image": Image,
-    "PageBreak": PageBreak,
-    "Table": Table,
-    "Header": Header,
-    "Footer": Footer,
-    "Caption": FigureCaption,
-    "Footnote": Footer,
-    "Formula": Formula,
-    "List-item": ListItem,
-    "Page-footer": Footer,
-    "Page-header": Header,  # Title?
-    "Picture": Image,
+    ElementType.UNCATEGORIZED_TEXT: Text,
+    ElementType.FIGURE_CAPTION: FigureCaption,
+    ElementType.FIGURE: FigureCaption,
+    ElementType.TEXT: NarrativeText,
+    ElementType.NARRATIVE_TEXT: NarrativeText,
+    ElementType.LIST_ITEM: ListItem,
+    ElementType.BULLETED_TEXT: ListItem,
+    ElementType.TITLE: Title,
+    ElementType.ADDRESS: Address,
+    ElementType.EMAIL_ADDRESS: EmailAddress,
+    ElementType.IMAGE: Image,
+    ElementType.PAGE_BREAK: PageBreak,
+    ElementType.TABLE: Table,
+    ElementType.HEADER: Header,
+    ElementType.FOOTER: Footer,
+    ElementType.CAPTION: FigureCaption,
+    ElementType.FOOTNOTE: Footer,
+    ElementType.FORMULA: Formula,
+    ElementType.LIST_ITEM_OTHER: ListItem,
+    ElementType.PAGE_FOOTER: Footer,
+    ElementType.PAGE_HEADER: Header,  # Title?
+    ElementType.PICTURE: Image,
    # this mapping favors ensures yolox produces backward compatible categories
-    "Section-header": Title,
-    "Headline": Title,
-    "Subheadline": Title,
-    "Abstract": NarrativeText,
-    "Threading": NarrativeText,
-    "Form": NarrativeText,
-    "Field-Name": Title,
-    "Value": NarrativeText,
-    "Link": NarrativeText,
-    "CompositeElement": Text,
+    ElementType.SECTION_HEADER: Title,
+    ElementType.HEADLINE: Title,
+    ElementType.SUB_HEADLINE: Title,
+    ElementType.ABSTRACT: NarrativeText,
+    ElementType.THREADING: NarrativeText,
+    ElementType.FORM: NarrativeText,
+    ElementType.FIELD_NAME: Title,
+    ElementType.VALUE: NarrativeText,
+    ElementType.LINK: NarrativeText,
+    ElementType.COMPOSITE_ELEMENT: Text,
 }
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -28,6 +28,7 @@ from unstructured.documents.elements import (
    CoordinatesMetadata,
    Element,
    ElementMetadata,
+    ElementType,
    ListItem,
    PageBreak,
    Text,
@ -136,7 +137,7 @@ def normalize_layout_element(
        class_prob_metadata = ElementMetadata(detection_class_prob=float(prob))  # type: ignore
    else:
        class_prob_metadata = ElementMetadata()
-    if element_type == "List":
+    if element_type == ElementType.LIST:
        if infer_list_items:
            return layout_list_to_list_items(
                text,
@ -163,12 +164,12 @@ def normalize_layout_element(
            metadata=class_prob_metadata,
            detection_origin=origin,
        )
-        if element_type == "Headline":
+        if element_type == ElementType.HEADLINE:
            _element_class.metadata.category_depth = 1
-        elif element_type == "Subheadline":
+        elif element_type == ElementType.SUB_HEADLINE:
            _element_class.metadata.category_depth = 2
        return _element_class
-    elif element_type == "Checked":
+    elif element_type == ElementType.CHECKED:
        return CheckBox(
            checked=True,
            coordinates=coordinates,
@ -176,7 +177,7 @@ def normalize_layout_element(
            metadata=class_prob_metadata,
            detection_origin=origin,
        )
-    elif element_type == "Unchecked":
+    elif element_type == ElementType.UNCHECKED:
        return CheckBox(
            checked=False,
            coordinates=coordinates,
--- a/unstructured/partition/ocr.py
+++ b/unstructured/partition/ocr.py
@ -22,6 +22,7 @@ from unstructured_inference.inference.layoutelement import (
 from unstructured_inference.models.tables import UnstructuredTableTransformerModel
 from unstructured_pytesseract import Output

+from unstructured.documents.elements import ElementType
 from unstructured.logger import logger
 from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import (
@ -256,7 +257,7 @@ def supplement_element_with_table_extraction(
    the table's text content is rendered into an html string.
    """
    for element in elements:
-        if element.type == "Table":
+        if element.type == ElementType.TABLE:
            padding = env_config.TABLE_IMAGE_CROP_PAD
            padded_element = pad_element_bboxes(element, padding=padding)
            cropped_image = image.crop(
@ -368,7 +369,9 @@ def get_layout_elements_from_ocr(
        # and merging steps are not necessary.

        layout_elements = [
-            LayoutElement(bbox=r.bbox, text=r.text, source=r.source, type="UncategorizedText")
+            LayoutElement(
+                bbox=r.bbox, text=r.text, source=r.source, type=ElementType.UNCATEGORIZED_TEXT
+            )
            for r in ocr_regions
        ]
    else:
@ -762,7 +765,9 @@ def get_elements_from_ocr_regions(

    merged_regions = [merge_text_regions(group) for group in grouped_regions]
    return [
-        LayoutElement(text=r.text, source=r.source, type="UncategorizedText", bbox=r.bbox)
+        LayoutElement(
+            text=r.text, source=r.source, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox
+        )
        for r in merged_regions
    ]

--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -45,6 +45,7 @@ from unstructured.documents.elements import (
    CoordinatesMetadata,
    Element,
    ElementMetadata,
+    ElementType,
    Image,
    Link,
    ListItem,
@ -323,7 +324,7 @@ def partition_pdf_or_image(
            )
            layout_elements = []
            for el in _layout_elements:
-                if hasattr(el, "category") and el.category == "UncategorizedText":
+                if hasattr(el, "category") and el.category == ElementType.UNCATEGORIZED_TEXT:
                    new_el = element_from_text(cast(Text, el).text)
                    new_el.metadata = el.metadata
                else:
@ -348,7 +349,7 @@ def partition_pdf_or_image(

            layout_elements = []
            for el in _layout_elements:
-                if hasattr(el, "category") and el.category == "UncategorizedText":
+                if hasattr(el, "category") and el.category == ElementType.UNCATEGORIZED_TEXT:
                    new_el = element_from_text(cast(Text, el).text)
                    new_el.metadata = el.metadata
                else: