feat: Adds a helper function to convert ISD dicts to elements (#39)

* updated category name for ListItem
* added brick to convert isd to elements
* bump version
* added isd_to_elements to documentation
2025-12-11 07:01:24 +00:00 · 2022-10-21 14:43:10 -04:00 · 2022-10-21 14:43:10 -04:00 · de31df51a9
commit de31df51a9
parent 2871941a80
6 changed files with 64 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,6 @@
-## 0.2.1-dev9
+## 0.2.1

+* Added brick to convert an ISD dictionary to a list of elements
 * Update `PDFDocument` to use the `from_file` method
 * Added staging brick for CSV format for ISD (Initial Structured Data) format.
 * Added staging brick for separating text into attention window size chunks for `transformers`.
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -338,6 +338,27 @@ Examples:
  isd = convert_to_isd(elements)


+``isd_to_elements``
+-------------------
+
+Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.staging.base import isd_to_elements
+
+  isd = [
+    {"text": "My Title", "type": "Title"},
+    {"text": "My Narrative", "type": "NarrativeText"}
+  ]
+
+  # elements will look like:
+  # [Title(text="My Title"), NarrativeText(text="My Narrative")]
+  elements = isd_to_elements(isd)
+
+
 ``convert_to_isd_csv``
 ----------------------

--- a/test_unstructured/staging/test_base_staging.py
+++ b/test_unstructured/staging/test_base_staging.py
@ -4,7 +4,7 @@ import csv

 import unstructured.staging.base as base

-from unstructured.documents.elements import Title, NarrativeText
+from unstructured.documents.elements import Title, NarrativeText, ListItem


@pytest.fixture
@ -23,6 +23,23 @@ def test_convert_to_isd():
    assert isd[1]["type"] == "NarrativeText"


+def test_isd_to_elements():
+    """Checks the element built for each ISD "type", including the legacy "BulletedText"."""
+    records = [
+        {"text": "Blurb1", "type": "NarrativeText"},
+        {"text": "Blurb2", "type": "Title"},
+        {"text": "Blurb3", "type": "ListItem"},
+        {"text": "Blurb4", "type": "BulletedText"},
+    ]
+    converted = base.isd_to_elements(records)
+
+    # Both "ListItem" and "BulletedText" entries are expected to come back as ListItem.
+    expected = [
+        NarrativeText(text="Blurb1"),
+        Title(text="Blurb2"),
+        ListItem(text="Blurb3"),
+        ListItem(text="Blurb4"),
+    ]
+    assert converted == expected
+
+
 def test_convert_to_isd_csv(output_csv_file):

    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.2.1-dev9"  # pragma: no cover
+__version__ = "0.2.1"  # pragma: no cover
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -47,9 +47,11 @@ class NarrativeText(Text):


 class ListItem(Text):
-    """BulletedText is a NarrativeText element that is part of a bulleted list."""
+    """ListItem is a NarrativeText element that is part of a list."""

-    category = "BulletedText"
+    # Category label for this element type; it was "BulletedText" before the rename.
+    category = "ListItem"


 class Title(Text):
--- a/unstructured/staging/base.py
+++ b/unstructured/staging/base.py
@ -2,7 +2,7 @@ import io
 import csv
 from typing import Dict, List

-from unstructured.documents.elements import Text
+from unstructured.documents.elements import Text, NarrativeText, Title, ListItem


 def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
    return isd


+def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]:
+    """Builds a list of Text elements from Initial Structured Data (ISD) entries.
+
+    Each entry is a dict read through its "type" and "text" keys. An entry whose "type" is
+    not one of the names in the table below is skipped, so the returned list can be shorter
+    than the input.
+    """
+    # NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used
+    # to be called BulletedText in an earlier version
+    element_classes = {
+        "NarrativeText": NarrativeText,
+        "Title": Title,
+        "ListItem": ListItem,
+        "BulletedText": ListItem,
+    }
+
+    elements: List[Text] = []
+    for item in isd:
+        element_class = element_classes.get(item["type"])
+        if element_class is not None:
+            elements.append(element_class(text=item["text"]))
+
+    return elements
+
+
 def convert_to_isd_csv(elements: List[Text]) -> str:
    """
    Returns the representation of document elements as an Initial Structured Document (ISD)