feat: Optionally include LabelStudio annotations in staging brick (#19)

* added types for label studio annotations

* added method to cast as dicts

* added length check for annotations

* tweaks to get upload to work

* added validation for label types

* annotations is a list for each example

* little bit of refactoring

* test for staging with label studio

* tests for error conditions and reviewers

* added test for NER annotations

* updated changelog and bumped version

* added docs with annotation examples

* fix label studio link

* bump version in sphinx docs

* fulle -> full (typo fix)
Matt Robinson 2022-10-04 09:25:05 -04:00 committed by GitHub
parent 29607c32ba
commit a950559b94
6 changed files with 377 additions and 7 deletions


@@ -1,7 +1,8 @@
## 0.2.1-dev2
## 0.2.1-dev3
* Added staging brick for CSV format for Prodigy
* Added staging brick for Prodigy
* Added ability to upload LabelStudio annotations
* Added text_field and id_field to stage_for_label_studio signature
## 0.2.0


@@ -361,6 +361,98 @@ Examples:

        json.dump(label_studio_data, f, indent=4)

You can also include pre-annotations as part of your LabelStudio upload. The
``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
annotations for each element in the ``elements`` list. If an element does not have any annotations,
use an empty list, as shown in the sketch between the two examples below.

The following shows an example of how to upload annotations for the "Text Classification"
task in LabelStudio:

.. code:: python

    import json

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    elements = [NarrativeText(text="Narrative")]

    annotations = [[
        LabelStudioAnnotation(
            result=[
                LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ]
        )
    ]]

    label_studio_data = stage_for_label_studio(
        elements,
        annotations=annotations,
        text_field="my_text",
        id_field="my_id",
    )

    # The resulting JSON file is ready to be uploaded to LabelStudio
    # with annotations included
    with open("label_studio.json", "w") as f:
        json.dump(label_studio_data, f, indent=4)

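If only some elements are annotated, the inner lists must still line up with ``elements``
one-to-one, with an empty list for each unannotated element. The following is a minimal
editorial sketch of that case (the second element and its text are illustrative, not part
of the original example):

.. code:: python

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    elements = [
        NarrativeText(text="Narrative"),
        NarrativeText(text="A second, unannotated narrative"),  # illustrative element
    ]

    # One inner list per element; the second entry is an empty list because
    # the second element has no pre-annotations
    annotations = [
        [
            LabelStudioAnnotation(
                result=[
                    LabelStudioResult(
                        type="choices",
                        value={"choices": ["Positive"]},
                        from_name="sentiment",
                        to_name="text",
                    )
                ]
            )
        ],
        [],
    ]

    label_studio_data = stage_for_label_studio(elements, annotations=annotations)
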
The following shows an example of how to upload annotations for the "Named Entity Recognition"
task in LabelStudio:

.. code:: python

    import json

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    elements = [NarrativeText(text="Narrative")]

    annotations = [[
        LabelStudioAnnotation(
            result=[
                LabelStudioResult(
                    type="labels",
                    value={"start": 0, "end": 9, "text": "Narrative", "labels": ["MISC"]},
                    from_name="label",
                    to_name="text",
                )
            ]
        )
    ]]

    label_studio_data = stage_for_label_studio(
        elements,
        annotations=annotations,
        text_field="my_text",
        id_field="my_id",
    )

    # The resulting JSON file is ready to be uploaded to LabelStudio
    # with annotations included
    with open("label_studio.json", "w") as f:
        json.dump(label_studio_data, f, indent=4)

See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full list of options
for labels and annotations.
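Note that the ``type`` of each result is validated against that list of tag types, and an
unrecognized type raises a ``ValueError``. A minimal sketch of the failure mode, mirroring
the validation introduced in this commit:

.. code:: python

    from unstructured.staging.label_studio import LabelStudioResult

    # Raises a ValueError because "bears" is not a valid LabelStudio tag type
    LabelStudioResult(
        type="bears",
        value={"bears": ["Positive"]},
        from_name="sentiment",
        to_name="text",
    )
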
``stage_for_prodigy``
--------------------------


@@ -22,7 +22,7 @@ copyright = '2022, Unstructured Technologies'
author = 'Unstructured Technologies'
# The full version, including alpha/beta/rc tags
release = '0.0.1'
release = '0.2.1-dev3'
# -- General configuration ---------------------------------------------------


@@ -28,3 +28,175 @@ def test_specify_text_name(elements):

def test_specify_id_name(elements):
    label_studio_data = label_studio.stage_for_label_studio(elements, id_field="random_id")
    assert "random_id" in label_studio_data[0]["data"]


def test_created_annotation():
    annotation = label_studio.LabelStudioAnnotation(
        result=[
            label_studio.LabelStudioResult(
                type="choices",
                value={"choices": ["Positive"]},
                from_name="sentiment",
                to_name="text",
            )
        ]
    )
    # NOTE: the original test was missing the assert, making the comparison a no-op
    assert annotation.to_dict() == {
        "result": [
            {
                "type": "choices",
                "value": {"choices": ["Positive"]},
                "from_name": "sentiment",
                "id": None,
                "to_name": "text",
                "hidden": False,
                "read_only": False,
            }
        ],
        "was_canceled": False,
    }


def test_stage_with_annotation():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ]
        )
    ]
    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                }
            ],
        }
    ]


def test_stage_with_annotation_for_ner():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="labels",
                    value={"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
                    from_name="label",
                    to_name="text",
                )
            ]
        )
    ]
    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "labels",
                            "value": {"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
                            "from_name": "label",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                }
            ],
        }
    ]


def test_stage_with_annotation_raises_with_mismatched_lengths():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ]
        )
    ]
    with pytest.raises(ValueError):
        label_studio.stage_for_label_studio([element], [annotations, annotations])


def test_stage_with_annotation_raises_with_invalid_type():
    with pytest.raises(ValueError):
        label_studio.LabelStudioResult(
            type="bears",
            value={"bears": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )


def test_stage_with_reviewed_annotation():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            reviews=[label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)],
        )
    ]
    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "to_name": "text",
                            "id": None,
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "reviews": [{"created_by": {"user_id": 1}, "accepted": True, "id": None}],
                    "was_canceled": False,
                }
            ],
        }
    ]


@@ -1 +1 @@
__version__ = "0.2.1-dev2" # pragma: no cover
__version__ = "0.2.1-dev3" # pragma: no cover


@@ -1,21 +1,126 @@
from typing import Dict, List
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

from unstructured.documents.elements import Text

LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]]

# NOTE(robinson) - ref: https://labelstud.io/tags/labels.html
VALID_LABEL_TYPES = [
    "labels",
    "hypertextlabels",
    "paragraphlabels",
    "rectangle",
    "keypoint",
    "polygon",
    "brush",
    "ellipse",
    "rectanglelabels",
    "keypointlabels",
    "polygonlabels",
    "brushlabels",
    "ellipselabels",
    "timeserieslabels",
    "choices",
    "number",
    "taxonomy",
    "textarea",
    "rating",
    "pairwise",
    "videorectangle",
]

@dataclass
class LabelStudioResult:
    """Class for representing a LabelStudio annotation result.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    type: str  # The type of tag used to annotate the task
    value: Dict[str, Any]  # The values for the annotation result
    from_name: str  # Name of the control tag used to label (e.g. "sentiment" for the sentiment template)
    to_name: str  # Name of the object tag that was labeled (e.g. "text")
    id: Optional[str] = None
    hidden: bool = False
    read_only: bool = False

    def __post_init__(self):
        if self.type not in VALID_LABEL_TYPES:
            raise ValueError(
                f"{self.type} is not a valid label type. "
                f"Valid label types are: {VALID_LABEL_TYPES}"
            )

    def to_dict(self):
        return self.__dict__

@dataclass
class LabelStudioReview:
    """Class for representing a LabelStudio review. Reviews are only available in the
    Enterprise offering.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    created_by: Dict[str, Union[str, int]]
    accepted: bool
    id: Optional[str] = None

    def to_dict(self):
        return self.__dict__

@dataclass
class LabelStudioAnnotation:
    """Class for representing LabelStudio annotations.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    result: List[LabelStudioResult]  # The result of the annotation
    id: Optional[str] = None
    lead_time: Optional[float] = None  # Time in seconds to label the task
    completed_by: Optional[int] = None  # User ID for the user who completed the task
    reviews: Optional[List[LabelStudioReview]] = None  # An array of the review results
    was_canceled: bool = False  # Indicates whether or not the annotation was canceled

    def to_dict(self):
        annotation_dict = deepcopy(self.__dict__)
        annotation_dict["result"] = [r.to_dict() for r in annotation_dict["result"]]
        if "reviews" in annotation_dict and annotation_dict["reviews"] is not None:
            annotation_dict["reviews"] = [r.to_dict() for r in annotation_dict["reviews"]]
        # NOTE(robinson) - Removes keys for any fields that defaulted to None
        _annotation_dict = deepcopy(annotation_dict)
        for key, value in annotation_dict.items():
            if value is None:
                _annotation_dict.pop(key)
        return _annotation_dict

def stage_for_label_studio(
    elements: List[Text], text_field: str = "text", id_field: str = "ref_id"
    elements: List[Text],
    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
    text_field: str = "text",
    id_field: str = "ref_id",
) -> LABEL_STUDIO_TYPE:
    """Converts the document to the format required for upload to LabelStudio.
    ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
    if annotations is not None:
        if len(elements) != len(annotations):
            raise ValueError("The length of elements and annotations must match.")

    label_studio_data: LABEL_STUDIO_TYPE = list()
    for element in elements:
    for i, element in enumerate(elements):
        data: Dict[str, str] = dict()
        data[text_field] = element.text
        if isinstance(element.id, str):
            data[id_field] = element.id
        label_studio_data.append({"data": data})
        labeling_example: Dict[str, Any] = dict()
        labeling_example["data"] = data
        if annotations is not None:
            labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
        label_studio_data.append(labeling_example)

    return label_studio_data