diff --git a/CHANGELOG.md b/CHANGELOG.md index f463a7ae4..5d003e5c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.2.1-dev7 +## 0.2.1-dev8 +* Added staging brick for converting ISD (Initial Structured Data) to CSV format. * Added staging brick for separating text into attention window size chunks for `transformers`. * Added staging brick for LabelBox. * Added ability to upload LabelStudio predictions diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index bb8547b54..a1b38cf97 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -338,6 +338,22 @@ Examples: isd = convert_to_isd(elements) +``convert_to_isd_csv`` +---------------------- + +Converts outputs to the initial structured data (ISD) format as a CSV string. + +Examples: + +.. code:: python + + from unstructured.documents.elements import Title, NarrativeText + from unstructured.staging.base import convert_to_isd_csv + + elements = [Title(text="Title"), NarrativeText(text="Narrative")] + isd_csv = convert_to_isd_csv(elements) + + ``stage_for_transformers`` -------------------------- @@ -422,7 +438,6 @@ The following optional keyword arguments can be specified in results = [nlp(chunk) for chunk in chunks] - ``stage_for_label_studio`` -------------------------- diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py index 3b3e14868..f84cf4e85 100644 --- a/test_unstructured/staging/test_base_staging.py +++ b/test_unstructured/staging/test_base_staging.py @@ -1,8 +1,17 @@ +import os +import pytest +import csv + import unstructured.staging.base as base from unstructured.documents.elements import Title, NarrativeText +@pytest.fixture +def output_csv_file(tmp_path): + return os.path.join(tmp_path, "isd_data.csv") + + def test_convert_to_isd(): elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")] isd = base.convert_to_isd(elements) @@ -12,3 +21,16 @@ def test_convert_to_isd(): assert isd[1]["text"] == 
"Narrative 1" assert isd[1]["type"] == "NarrativeText" + + +def test_convert_to_isd_csv(output_csv_file): + + elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")] + with open(output_csv_file, "w+") as csv_file: + isd_csv_string = base.convert_to_isd_csv(elements) + csv_file.write(isd_csv_string) + + fieldnames = ["type", "text"] + with open(output_csv_file, "r") as csv_file: + csv_rows = csv.DictReader(csv_file) + assert all(set(row.keys()) == set(fieldnames) for row in csv_rows) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 27bbd8537..5aed66a07 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.2.1-dev7" # pragma: no cover +__version__ = "0.2.1-dev8" # pragma: no cover diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index d715015e9..408472eee 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -1,3 +1,5 @@ +import io +import csv from typing import Dict, List from unstructured.documents.elements import Text @@ -10,3 +12,17 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]: section = dict(text=element.text, type=element.category) isd.append(section) return isd + + +def convert_to_isd_csv(elements: List[Text]) -> str: + """ + Returns the representation of document elements as Initial Structured Data (ISD) + in CSV format. + """ + csv_fieldnames: List[str] = ["type", "text"] + rows: List[Dict[str, str]] = convert_to_isd(elements) + with io.StringIO() as buffer: + csv_writer = csv.DictWriter(buffer, fieldnames=csv_fieldnames) + csv_writer.writeheader() + csv_writer.writerows(rows) + return buffer.getvalue()