feat: Adds a helper function to convert ISD dicts to elements (#39)

* updated category name for ListItem

* added brick to convert isd to elements

* bump version

* added isd_to_elements to documentation
This commit is contained in:
Matt Robinson 2022-10-21 14:43:10 -04:00 committed by GitHub
parent 2871941a80
commit de31df51a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 64 additions and 6 deletions

View File

@ -1,5 +1,6 @@
## 0.2.1-dev9
## 0.2.1
* Added brick to convert an ISD dictionary to a list of elements
* Update `PDFDocument` to use the `from_file` method
* Added staging brick for converting ISD (Initial Structured Data) to CSV format.
* Added staging brick for separating text into attention window size chunks for `transformers`.

View File

@ -338,6 +338,27 @@ Examples:
isd = convert_to_isd(elements)
``isd_to_elements``
-------------------
Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements.
Examples:
.. code:: python
from unstructured.staging.base import isd_to_elements
isd = [
{"text": "My Title", "type": "Title"},
{"text": "My Narrative", "type": "NarrativeText"}
]
# elements will look like:
# [ Title(text="My Title"), NarrativeText(text="My Narrative")]
elements = isd_to_elements(isd)
``convert_to_isd_csv``
----------------------

View File

@ -4,7 +4,7 @@ import csv
import unstructured.staging.base as base
from unstructured.documents.elements import Title, NarrativeText
from unstructured.documents.elements import Title, NarrativeText, ListItem
@pytest.fixture
@ -23,6 +23,23 @@ def test_convert_to_isd():
assert isd[1]["type"] == "NarrativeText"
def test_isd_to_elements():
    """Each supported ISD "type" should come back as the matching element class.

    The legacy "BulletedText" type is expected to produce a ListItem as well.
    """
    isd = [
        {"text": "Blurb1", "type": "NarrativeText"},
        {"text": "Blurb2", "type": "Title"},
        {"text": "Blurb3", "type": "ListItem"},
        {"text": "Blurb4", "type": "BulletedText"},
    ]
    expected = [
        NarrativeText(text="Blurb1"),
        Title(text="Blurb2"),
        ListItem(text="Blurb3"),
        ListItem(text="Blurb4"),
    ]

    assert base.isd_to_elements(isd) == expected
def test_convert_to_isd_csv(output_csv_file):
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

View File

@ -1 +1 @@
__version__ = "0.2.1-dev9" # pragma: no cover
__version__ = "0.2.1" # pragma: no cover

View File

@ -47,9 +47,11 @@ class NarrativeText(Text):
class ListItem(Text):
    """A text element that is one entry of a list.

    This element's category used to be "BulletedText"; it is now "ListItem".
    """

    # Category label for this element type (renamed from "BulletedText").
    category = "ListItem"
class Title(Text):

View File

@ -2,7 +2,7 @@ import io
import csv
from typing import Dict, List
from unstructured.documents.elements import Text
from unstructured.documents.elements import Text, NarrativeText, Title, ListItem
def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
return isd
def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]:
    """Rebuild a list of Text elements from Initial Structured Data (ISD) items.

    Each item is a dict whose "type" selects the element class and whose "text"
    becomes the element's text. Items with any other "type" are skipped.
    """
    # NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used
    # to be called BulletedText in an earlier version
    element_classes = {
        "NarrativeText": NarrativeText,
        "Title": Title,
        "ListItem": ListItem,
        "BulletedText": ListItem,
    }

    elements: List[Text] = []
    for item in isd:
        element_class = element_classes.get(item["type"])
        if element_class is not None:
            # "text" is only read for recognised types, as in the if/elif form.
            elements.append(element_class(text=item["text"]))
    return elements
def convert_to_isd_csv(elements: List[Text]) -> str:
"""
Returns the representation of document elements as an Initial Structured Document (ISD)