mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 20:57:50 +00:00
feat: Implement staging brick for ISD CSV format (#36)
* Implement convert_to_isd_csv function * Add unit tests for convert_to_isd_csv function * Update docs with description and example of convert_to_isd_csv function * Update changelog and version
This commit is contained in:
parent
fb16847946
commit
2d5dba0ddc
@ -1,5 +1,6 @@
|
|||||||
## 0.2.1-dev7
|
## 0.2.1-dev8
|
||||||
|
|
||||||
|
* Added staging brick for converting ISD (Initial Structured Data) output to CSV format.
|
||||||
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
||||||
* Added staging brick for LabelBox.
|
* Added staging brick for LabelBox.
|
||||||
* Added ability to upload LabelStudio predictions
|
* Added ability to upload LabelStudio predictions
|
||||||
|
@ -338,6 +338,22 @@ Examples:
|
|||||||
isd = convert_to_isd(elements)
|
isd = convert_to_isd(elements)
|
||||||
|
|
||||||
|
|
||||||
|
``convert_to_isd_csv``
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Converts outputs to the initial structured data (ISD) format as a CSV string.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Title, NarrativeText
|
||||||
|
from unstructured.staging.base import convert_to_isd_csv
|
||||||
|
|
||||||
|
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
|
||||||
|
isd_csv = convert_to_isd_csv(elements)
|
||||||
|
|
||||||
|
|
||||||
``stage_for_transformers``
|
``stage_for_transformers``
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
@ -422,7 +438,6 @@ The following optional keyword arguments can be specified in
|
|||||||
results = [nlp(chunk) for chunk in chunks]
|
results = [nlp(chunk) for chunk in chunks]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
``stage_for_label_studio``
|
``stage_for_label_studio``
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
@ -1,8 +1,17 @@
|
|||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
import csv
|
||||||
|
|
||||||
import unstructured.staging.base as base
|
import unstructured.staging.base as base
|
||||||
|
|
||||||
from unstructured.documents.elements import Title, NarrativeText
|
from unstructured.documents.elements import Title, NarrativeText
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def output_csv_file(tmp_path):
    """Return the path of an ``isd_data.csv`` file located under ``tmp_path``."""
    csv_filename = "isd_data.csv"
    return os.path.join(tmp_path, csv_filename)
|
||||||
|
|
||||||
|
|
||||||
def test_convert_to_isd():
|
def test_convert_to_isd():
|
||||||
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
|
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
|
||||||
isd = base.convert_to_isd(elements)
|
isd = base.convert_to_isd(elements)
|
||||||
@ -12,3 +21,16 @@ def test_convert_to_isd():
|
|||||||
|
|
||||||
assert isd[1]["text"] == "Narrative 1"
|
assert isd[1]["text"] == "Narrative 1"
|
||||||
assert isd[1]["type"] == "NarrativeText"
|
assert isd[1]["type"] == "NarrativeText"
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_to_isd_csv(output_csv_file):
    """Check that ``convert_to_isd_csv`` output survives a write/read round trip.

    The CSV string is written to ``output_csv_file``, parsed back with
    ``csv.DictReader``, and compared against the header names and against the
    rows produced by ``convert_to_isd`` for the same elements.
    """
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
    isd_csv_string = base.convert_to_isd_csv(elements)

    # newline="" disables newline translation so the "\r\n" row terminators
    # already present in the CSV string are stored unchanged on every platform.
    with open(output_csv_file, "w", newline="") as csv_file:
        csv_file.write(isd_csv_string)

    fieldnames = ["type", "text"]
    # The csv module expects files it reads to be opened with newline="".
    with open(output_csv_file, "r", newline="") as csv_file:
        # Materialize the rows while the file is still open.
        csv_rows = list(csv.DictReader(csv_file))

    # Guard against the vacuous case: all() over zero rows would be True.
    assert len(csv_rows) == len(elements)
    assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)
    # Row contents must match the ISD dictionaries for the same elements.
    assert csv_rows == base.convert_to_isd(elements)
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.2.1-dev7" # pragma: no cover
|
__version__ = "0.2.1-dev8" # pragma: no cover
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import io
|
||||||
|
import csv
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
|
||||||
from unstructured.documents.elements import Text
|
from unstructured.documents.elements import Text
|
||||||
@ -10,3 +12,17 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
|
|||||||
section = dict(text=element.text, type=element.category)
|
section = dict(text=element.text, type=element.category)
|
||||||
isd.append(section)
|
isd.append(section)
|
||||||
return isd
|
return isd
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_isd_csv(elements: List[Text]) -> str:
    """
    Serializes document elements to a CSV string in the initial structured data (ISD)
    layout: a header row with the columns ``type`` and ``text``, followed by one row
    for each dictionary returned by ``convert_to_isd``.
    """
    isd_rows: List[Dict[str, str]] = convert_to_isd(elements)
    columns: List[str] = ["type", "text"]

    output = io.StringIO()
    with output:
        writer = csv.DictWriter(output, fieldnames=columns)
        writer.writeheader()
        for isd_row in isd_rows:
            writer.writerow(isd_row)
        # Read the text out before the buffer is closed on leaving the with block.
        return output.getvalue()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user