From 64e1c725ebd620219253026c2c02124051300dea Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Wed, 28 Sep 2022 09:30:17 -0500 Subject: [PATCH] feat: Add text_field and id_field to stage_for_label_studio signature (#9) Added text_field and id_field to stage_for_label_studio signature, to allow user to specify the keys in the resulting JSON. Includes tests and update to example in sphinx docs. --- CHANGELOG.md | 4 ++++ docs/source/bricks.rst | 2 +- .../staging/test_label_studio.py | 24 +++++++++++++++---- unstructured/__version__.py | 2 +- unstructured/staging/label_studio.py | 8 ++++--- 5 files changed, 31 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4941e0e72..b571b581d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.1-dev0 + +* Added text_field and id_field to stage_for_label_studio signature + ## 0.2.0 * Initial release of unstructured diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 89610d04d..3d5787d4d 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -354,7 +354,7 @@ Examples: from unstructured.staging.label_studio import stage_for_label_studio elements = [Title(text="Title"), NarrativeText(text="Narrative")] - label_studio_data = stage_for_label_studio(elements) + label_studio_data = stage_for_label_studio(elements, text_field="my_text", id_field="my_id") # The resulting JSON file is ready to be uploaded to LabelStudio with open("label_studio.json", "w") as f: diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py index e94d4c110..644f1b132 100644 --- a/test_unstructured/staging/test_label_studio.py +++ b/test_unstructured/staging/test_label_studio.py @@ -1,14 +1,30 @@ +import pytest import unstructured.staging.label_studio as label_studio from unstructured.documents.elements import Title, NarrativeText -def test_convert_to_label_studio_data(): - elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")] +@pytest.fixture +def elements(): + return [Title(text="Title 1"), NarrativeText(text="Narrative 1")] + + +def test_convert_to_label_studio_data(elements): label_studio_data = label_studio.stage_for_label_studio(elements) - assert label_studio_data[0]["data"]["my_text"] == "Title 1" + assert label_studio_data[0]["data"]["text"] == "Title 1" assert "ref_id" in label_studio_data[0]["data"] - assert label_studio_data[1]["data"]["my_text"] == "Narrative 1" + assert label_studio_data[1]["data"]["text"] == "Narrative 1" assert "ref_id" in label_studio_data[1]["data"] + + +def test_specify_text_name(elements): + label_studio_data = label_studio.stage_for_label_studio(elements, text_field="random_text") + assert "random_text" in label_studio_data[0]["data"] + assert label_studio_data[0]["data"]["random_text"] == "Title 1" + + +def test_specify_id_name(elements): + label_studio_data = label_studio.stage_for_label_studio(elements, id_field="random_id") + assert "random_id" in label_studio_data[0]["data"] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 23894ce6c..aefc45561 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.2.0" # pragma: no cover +__version__ = "0.2.1-dev0" # pragma: no cover diff --git a/unstructured/staging/label_studio.py b/unstructured/staging/label_studio.py index 3bd59a1f2..1e83633ac 100644 --- a/unstructured/staging/label_studio.py +++ b/unstructured/staging/label_studio.py @@ -6,14 +6,16 @@ from unstructured.documents.elements import Text LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]] -def stage_for_label_studio(elements: List[Text]) -> LABEL_STUDIO_TYPE: +def stage_for_label_studio( + elements: List[Text], text_field: str = "text", id_field: str = "ref_id" +) -> LABEL_STUDIO_TYPE: """Converts the document to the format required for upload to LabelStudio. ref: https://labelstud.io/guide/tasks.html#Example-JSON-format""" label_studio_data: LABEL_STUDIO_TYPE = list() for element in elements: data: Dict[str, str] = dict() - data["my_text"] = element.text + data[text_field] = element.text if isinstance(element.id, str): - data["ref_id"] = element.id + data[id_field] = element.id label_studio_data.append({"data": data}) return label_studio_data