feat: Allow option to specify predictions in LabelStudio staging brick (#23)

* Allow stage_for_label_studio to take a predictions input and implement a prediction class

* Update unit tests for LabelStudioPrediction and stage_for_label_studio function

* Update stage_for_label_studio docs with example of loading predictions

* Bump version and update changelog

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
asymness 2022-10-06 18:35:55 +05:00 committed by GitHub
parent 779e48bafe
commit baba641d03
5 changed files with 172 additions and 4 deletions


@@ -1,5 +1,6 @@
-## 0.2.1-dev4
+## 0.2.1-dev5
+* Added ability to upload LabelStudio predictions
* Added utility function for JSONL reading and writing
* Added staging brick for CSV format for Prodigy
* Added staging brick for Prodigy


@@ -361,8 +361,9 @@ Examples:
        json.dump(label_studio_data, f, indent=4)

-You can also include pre-annotations as part of your LabelStudio upload. The
-``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
+You can also include pre-annotations and predictions as part of your LabelStudio upload.
+The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
annotations for each element in the ``elements`` list. If an element does not have any annotations,
use an empty list.
The following shows an example of how to upload annotations for the "Text Classification"
@@ -407,6 +408,52 @@ task in LabelStudio:
        json.dump(label_studio_data, f, indent=4)

Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with
the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of
predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list.
The following shows an example of how to upload predictions for the "Text Classification"
task in LabelStudio:

.. code:: python

    import json

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioPrediction,
        LabelStudioResult,
    )

    elements = [NarrativeText(text="Narrative")]

    predictions = [[
        LabelStudioPrediction(
            result=[
                LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            score=0.68
        )
    ]]

    label_studio_data = stage_for_label_studio(
        elements,
        predictions=predictions,
        text_field="my_text",
        id_field="my_id"
    )

    # The resulting JSON file is ready to be uploaded to LabelStudio
    # with predictions included
    with open("label_studio.json", "w") as f:
        json.dump(label_studio_data, f, indent=4)

The following shows an example of how to upload annotations for the "Named Entity Recognition"
task in LabelStudio:


@@ -115,6 +115,49 @@ def test_created_annotation():
    }


@pytest.mark.parametrize(
    "score, raises, exception",
    [
        (None, True, ValueError),
        (-0.25, True, ValueError),
        (0, False, None),
        (0.5, False, None),
        (1, False, None),
        (1.25, True, ValueError),
    ],
)
def test_init_prediction(score, raises, exception):
    result = [
        label_studio.LabelStudioResult(
            type="choices",
            value={"choices": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )
    ]

    if raises:
        with pytest.raises(exception):
            label_studio.LabelStudioPrediction(result=result, score=score)
    else:
        prediction = label_studio.LabelStudioPrediction(result=result, score=score)
        assert prediction.to_dict() == {
            "result": [
                {
                    "type": "choices",
                    "value": {"choices": ["Positive"]},
                    "from_name": "sentiment",
                    "id": None,
                    "to_name": "text",
                    "hidden": False,
                    "read_only": False,
                }
            ],
            "was_canceled": False,
            "score": score,
        }


def test_stage_with_annotation():
    element = NarrativeText(text="A big brown bear")
    annotations = [
@@ -153,6 +196,46 @@ def test_stage_with_annotation():
    ]


def test_stage_with_prediction():
    element = NarrativeText(text="A big brown bear")
    prediction = [
        label_studio.LabelStudioPrediction(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            score=0.98,
        )
    ]

    label_studio_data = label_studio.stage_for_label_studio([element], predictions=[prediction])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "predictions": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                    "score": 0.98,
                }
            ],
        }
    ]


def test_stage_with_annotation_for_ner():
    element = NarrativeText(text="A big brown bear")
    annotations = [
@@ -209,6 +292,25 @@ def test_stage_with_annotation_raises_with_mismatched_lengths():
        label_studio.stage_for_label_studio([element], [annotations, annotations])


def test_stage_with_prediction_raises_with_mismatched_lengths():
    element = NarrativeText(text="A big brown bear")
    prediction = [
        label_studio.LabelStudioPrediction(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            score=0.82,
        )
    ]

    with pytest.raises(ValueError):
        label_studio.stage_for_label_studio([element], predictions=[prediction, prediction])


def test_stage_with_annotation_raises_with_invalid_type():
    with pytest.raises(ValueError):
        label_studio.LabelStudioResult(


@@ -1 +1 @@
-__version__ = "0.2.1-dev4" # pragma: no cover
+__version__ = "0.2.1-dev5" # pragma: no cover


@@ -98,9 +98,22 @@ class LabelStudioAnnotation:
        return _annotation_dict


@dataclass
class LabelStudioPrediction(LabelStudioAnnotation):
    score: float = 0

    def __post_init__(self):
        if not isinstance(self.score, (int, float)) or (self.score < 0 or self.score > 1):
            raise ValueError(
                f"{self.score} is not a valid score value. "
                f"Score value must be a number between 0 and 1."
            )


def stage_for_label_studio(
    elements: List[Text],
    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
    predictions: Optional[List[List[LabelStudioPrediction]]] = None,
    text_field: str = "text",
    id_field: str = "ref_id",
) -> LABEL_STUDIO_TYPE:
@@ -109,6 +122,9 @@ def stage_for_label_studio(
    if annotations is not None:
        if len(elements) != len(annotations):
            raise ValueError("The length of elements and annotations must match.")

    if predictions is not None:
        if len(elements) != len(predictions):
            raise ValueError("The length of elements and predictions must match.")

    label_studio_data: LABEL_STUDIO_TYPE = list()
    for i, element in enumerate(elements):
@@ -121,6 +137,8 @@ def stage_for_label_studio(
        labeling_example["data"] = data

        if annotations is not None:
            labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
        if predictions is not None:
            labeling_example["predictions"] = [a.to_dict() for a in predictions[i]]

        label_studio_data.append(labeling_example)

    return label_studio_data
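
Taken together, the new class and the updated staging function behave roughly as follows. This is a minimal sketch, not part of the commit, assuming the package layout shown in the diff above; the score lookup mirrors the structure asserted in test_stage_with_prediction.

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        LabelStudioPrediction,
        LabelStudioResult,
        stage_for_label_studio,
    )

    result = [
        LabelStudioResult(
            type="choices",
            value={"choices": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )
    ]

    # Scores outside [0, 1] are rejected by LabelStudioPrediction.__post_init__
    try:
        LabelStudioPrediction(result=result, score=1.25)
    except ValueError as exc:
        print(exc)  # "1.25 is not a valid score value. ..."

    # A valid score is staged under the "predictions" key for each element
    prediction = LabelStudioPrediction(result=result, score=0.75)
    staged = stage_for_label_studio(
        [NarrativeText(text="Narrative")], predictions=[[prediction]]
    )
    print(staged[0]["predictions"][0]["score"])  # 0.75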