mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-26 02:21:04 +00:00
feat: Adds a helper function to convert ISD dicts to elements (#39)
* updated category name for ListItem
* added brick to convert isd to elements
* bump version
* added isd_to_elements to documentation
This commit is contained in:
parent
2871941a80
commit
de31df51a9
@ -1,5 +1,6 @@
|
|||||||
## 0.2.1-dev9
|
## 0.2.1
|
||||||
|
|
||||||
|
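* Added `isd_to_elements` brick to convert a list of ISD dictionaries back to a list of elements
* Renamed the `ListItem` category from `BulletedText` to `ListItem`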
|
||||||
* Update `PDFDocument` to use the `from_file` method
|
* Update `PDFDocument` to use the `from_file` method
|
||||||
* Added staging brick for CSV format for ISD (Initial Structured Data) format.
|
* Added staging brick for CSV format for ISD (Initial Structured Data) format.
|
||||||
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
||||||
|
@ -338,6 +338,27 @@ Examples:
|
|||||||
isd = convert_to_isd(elements)
|
isd = convert_to_isd(elements)
|
||||||
|
|
||||||
|
|
||||||
|
``isd_to_elements``
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
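Converts a list of dictionaries in initial structured data (ISD) format back to a list of ``Text`` elements. Each dictionary provides a ``text`` and a ``type`` key. The recognized types are ``Title``, ``NarrativeText``, ``ListItem``, and the legacy name ``BulletedText`` (converted to ``ListItem``); items with any other ``type`` are skipped.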
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.staging.base import isd_to_elements
|
||||||
|
|
||||||
|
isd = [
|
||||||
|
{"text": "My Title", "type": "Title"},
|
||||||
|
{"text": "My Narrative", "type": "NarrativeText"}
|
||||||
|
]
|
||||||
|
|
||||||
|
# elements will look like:
|
||||||
|
# [Title(text="My Title"), NarrativeText(text="My Narrative")]
|
||||||
|
elements = isd_to_elements(isd)
|
||||||
|
|
||||||
|
|
||||||
``convert_to_isd_csv``
|
``convert_to_isd_csv``
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ import csv
|
|||||||
|
|
||||||
import unstructured.staging.base as base
|
import unstructured.staging.base as base
|
||||||
|
|
||||||
from unstructured.documents.elements import Title, NarrativeText
|
from unstructured.documents.elements import Title, NarrativeText, ListItem
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -23,6 +23,23 @@ def test_convert_to_isd():
|
|||||||
assert isd[1]["type"] == "NarrativeText"
|
assert isd[1]["type"] == "NarrativeText"
|
||||||
|
|
||||||
|
|
||||||
|
def test_isd_to_elements():
|
||||||
|
isd = [
|
||||||
|
{"text": "Blurb1", "type": "NarrativeText"},
|
||||||
|
{"text": "Blurb2", "type": "Title"},
|
||||||
|
{"text": "Blurb3", "type": "ListItem"},
|
||||||
|
{"text": "Blurb4", "type": "BulletedText"},
|
||||||
|
]
|
||||||
|
|
||||||
|
elements = base.isd_to_elements(isd)
|
||||||
|
assert elements == [
|
||||||
|
NarrativeText(text="Blurb1"),
|
||||||
|
Title(text="Blurb2"),
|
||||||
|
ListItem(text="Blurb3"),
|
||||||
|
ListItem(text="Blurb4"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_convert_to_isd_csv(output_csv_file):
|
def test_convert_to_isd_csv(output_csv_file):
|
||||||
|
|
||||||
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
|
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.2.1-dev9" # pragma: no cover
|
__version__ = "0.2.1" # pragma: no cover
|
||||||
|
@ -47,9 +47,11 @@ class NarrativeText(Text):
|
|||||||
|
|
||||||
|
|
||||||
class ListItem(Text):
|
class ListItem(Text):
|
||||||
"""BulletedText is a NarrativeText element that is part of a bulleted list."""
|
"""ListItem is a NarrativeText element that is part of a list."""
|
||||||
|
|
||||||
category = "BulletedText"
|
category = "ListItem"
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Title(Text):
|
class Title(Text):
|
||||||
|
@ -2,7 +2,7 @@ import io
|
|||||||
import csv
|
import csv
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
|
||||||
from unstructured.documents.elements import Text
|
from unstructured.documents.elements import Text, NarrativeText, Title, ListItem
|
||||||
|
|
||||||
|
|
||||||
def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
|
def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
|
||||||
@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
|
|||||||
return isd
|
return isd
|
||||||
|
|
||||||
|
|
||||||
|
def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]:
|
||||||
|
"""Converts an Initial Structured Data (ISD) dictionary to a list of Text elements."""
|
||||||
|
elements: List[Text] = list()
|
||||||
|
|
||||||
|
for item in isd:
|
||||||
|
if item["type"] == "NarrativeText":
|
||||||
|
elements.append(NarrativeText(text=item["text"]))
|
||||||
|
elif item["type"] == "Title":
|
||||||
|
elements.append(Title(text=item["text"]))
|
||||||
|
# NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used
|
||||||
|
# to be called BulletedText in an earlier version
|
||||||
|
elif item["type"] in ["ListItem", "BulletedText"]:
|
||||||
|
elements.append(ListItem(text=item["text"]))
|
||||||
|
|
||||||
|
return elements
|
||||||
|
|
||||||
|
|
||||||
def convert_to_isd_csv(elements: List[Text]) -> str:
|
def convert_to_isd_csv(elements: List[Text]) -> str:
|
||||||
"""
|
"""
|
||||||
Returns the representation of document elements as an Initial Structured Document (ISD)
|
Returns the representation of document elements as an Initial Structured Document (ISD)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user