mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: Adds a helper function to convert ISD dicts to elements (#39)
* updated category name for ListItem
* added brick to convert isd to elements
* bump version
* added isd_to_elements to documentation
This commit is contained in:
parent
2871941a80
commit
de31df51a9
@ -1,5 +1,6 @@
|
||||
## 0.2.1-dev9
|
||||
## 0.2.1
|
||||
|
||||
* Added brick to convert an ISD dictionary to a list of elements
|
||||
* Update `PDFDocument` to use the `from_file` method
|
||||
* Added staging brick for exporting ISD (Initial Structured Data) in CSV format.
|
||||
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
||||
|
@ -338,6 +338,27 @@ Examples:
|
||||
isd = convert_to_isd(elements)
|
||||
|
||||
|
||||
``isd_to_elements``
|
||||
-------------------
|
||||
|
||||
Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.staging.base import isd_to_elements
|
||||
|
||||
isd = [
|
||||
{"text": "My Title", "type": "Title"},
|
||||
{"text": "My Narrative", "type": "NarrativeText"}
|
||||
]
|
||||
|
||||
# elements will look like:
|
||||
# [Title(text="My Title"), NarrativeText(text="My Narrative")]
|
||||
elements = isd_to_elements(isd)
|
||||
|
||||
|
||||
``convert_to_isd_csv``
|
||||
----------------------
|
||||
|
||||
|
@ -4,7 +4,7 @@ import csv
|
||||
|
||||
import unstructured.staging.base as base
|
||||
|
||||
from unstructured.documents.elements import Title, NarrativeText
|
||||
from unstructured.documents.elements import Title, NarrativeText, ListItem
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -23,6 +23,23 @@ def test_convert_to_isd():
|
||||
assert isd[1]["type"] == "NarrativeText"
|
||||
|
||||
|
||||
def test_isd_to_elements():
    """Check that every ISD entry comes back as the element class for its type."""
    # The "BulletedText" entry is expected to be rebuilt as a ListItem, the same
    # as the "ListItem" entry.
    records = [
        {"text": "Blurb1", "type": "NarrativeText"},
        {"text": "Blurb2", "type": "Title"},
        {"text": "Blurb3", "type": "ListItem"},
        {"text": "Blurb4", "type": "BulletedText"},
    ]
    expected = [
        NarrativeText(text="Blurb1"),
        Title(text="Blurb2"),
        ListItem(text="Blurb3"),
        ListItem(text="Blurb4"),
    ]

    assert base.isd_to_elements(records) == expected
|
||||
|
||||
|
||||
def test_convert_to_isd_csv(output_csv_file):
|
||||
|
||||
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.2.1-dev9" # pragma: no cover
|
||||
__version__ = "0.2.1" # pragma: no cover
|
||||
|
@ -47,9 +47,11 @@ class NarrativeText(Text):
|
||||
|
||||
|
||||
class ListItem(Text):
    """ListItem is a NarrativeText element that is part of a list."""

    # Category label for this element type. The label used to be "BulletedText"
    # before it was renamed to match the class name.
    category = "ListItem"
|
||||
|
||||
|
||||
class Title(Text):
|
||||
|
@ -2,7 +2,7 @@ import io
|
||||
import csv
|
||||
from typing import Dict, List
|
||||
|
||||
from unstructured.documents.elements import Text
|
||||
from unstructured.documents.elements import Text, NarrativeText, Title, ListItem
|
||||
|
||||
|
||||
def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
|
||||
@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
|
||||
return isd
|
||||
|
||||
|
||||
def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]:
    """Rebuild Text elements from a list of Initial Structured Data (ISD) entries.

    Each entry is read through its "type" and "text" keys. Entries whose type is
    "NarrativeText", "Title", "ListItem" or "BulletedText" are converted to the
    matching element class, in input order. Entries with any other type are left
    out of the returned list.
    """
    # NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used
    # to be called BulletedText in an earlier version
    element_classes = {
        "NarrativeText": NarrativeText,
        "Title": Title,
        "ListItem": ListItem,
        "BulletedText": ListItem,
    }

    converted: List[Text] = []
    for entry in isd:
        element_class = element_classes.get(entry["type"])
        if element_class is not None:
            # "text" is only looked up for recognized types.
            converted.append(element_class(text=entry["text"]))

    return converted
|
||||
|
||||
|
||||
def convert_to_isd_csv(elements: List[Text]) -> str:
|
||||
"""
|
||||
Returns the representation of document elements as an Initial Structured Document (ISD)
|
||||
|
Loading…
x
Reference in New Issue
Block a user