feat: Allow option to specify predictions in LabelStudio staging brick (#23)

* Allow stage_for_label_studio to take a predictions input and implement a prediction class

* Update unit tests for LabelStudioPrediction and stage_for_label_studio function

* Update stage_for_label_studio docs with example of loading predictions

* Bump version and update changelog

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
asymness 2022-10-06 18:35:55 +05:00 committed by GitHub
parent 779e48bafe
commit baba641d03
5 changed files with 172 additions and 4 deletions


@@ -1,5 +1,6 @@
-## 0.2.1-dev4
+## 0.2.1-dev5
+* Added ability to upload LabelStudio predictions
* Added utility function for JSONL reading and writing
* Added staging brick for CSV format for Prodigy
* Added staging brick for Prodigy


@@ -361,8 +361,9 @@ Examples:
        json.dump(label_studio_data, f, indent=4)

-You can also include pre-annotations as part of your LabelStudio upload. The
-``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
+You can also include pre-annotations and predictions as part of your LabelStudio upload.
+The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
annotations for each element in the ``elements`` list. If an element does not have any annotations,
use an empty list.
The following shows an example of how to upload annotations for the "Text Classification"
@@ -407,6 +408,52 @@ task in LabelStudio:
        json.dump(label_studio_data, f, indent=4)

Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with
the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of
predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list.
The following shows an example of how to upload predictions for the "Text Classification"
task in LabelStudio:

.. code:: python

    import json

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioPrediction,
        LabelStudioResult,
    )

    elements = [NarrativeText(text="Narrative")]

    predictions = [[
        LabelStudioPrediction(
            result=[
                LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            score=0.68
        )
    ]]

    label_studio_data = stage_for_label_studio(
        elements,
        predictions=predictions,
        text_field="my_text",
        id_field="my_id"
    )

    # The resulting JSON file is ready to be uploaded to LabelStudio
    # with predictions included
    with open("label_studio.json", "w") as f:
        json.dump(label_studio_data, f, indent=4)

The following shows an example of how to upload annotations for the "Named Entity Recognition"
task in LabelStudio:


@@ -115,6 +115,49 @@ def test_created_annotation():
    }


@pytest.mark.parametrize(
    "score, raises, exception",
    [
        (None, True, ValueError),
        (-0.25, True, ValueError),
        (0, False, None),
        (0.5, False, None),
        (1, False, None),
        (1.25, True, ValueError),
    ],
)
def test_init_prediction(score, raises, exception):
    result = [
        label_studio.LabelStudioResult(
            type="choices",
            value={"choices": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )
    ]

    if raises:
        with pytest.raises(exception):
            label_studio.LabelStudioPrediction(result=result, score=score)
    else:
        prediction = label_studio.LabelStudioPrediction(result=result, score=score)
        assert prediction.to_dict() == {
            "result": [
                {
                    "type": "choices",
                    "value": {"choices": ["Positive"]},
                    "from_name": "sentiment",
                    "id": None,
                    "to_name": "text",
                    "hidden": False,
                    "read_only": False,
                }
            ],
            "was_canceled": False,
            "score": score,
        }


def test_stage_with_annotation():
    element = NarrativeText(text="A big brown bear")
    annotations = [
@@ -153,6 +196,46 @@ def test_stage_with_annotation():
    ]


def test_stage_with_prediction():
    element = NarrativeText(text="A big brown bear")
    prediction = [
        label_studio.LabelStudioPrediction(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            score=0.98,
        )
    ]

    label_studio_data = label_studio.stage_for_label_studio([element], predictions=[prediction])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "predictions": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                    "score": 0.98,
                }
            ],
        }
    ]


def test_stage_with_annotation_for_ner():
    element = NarrativeText(text="A big brown bear")
    annotations = [
@@ -209,6 +292,25 @@ def test_stage_with_annotation_raises_with_mismatched_lengths():
        label_studio.stage_for_label_studio([element], [annotations, annotations])


def test_stage_with_prediction_raises_with_mismatched_lengths():
    element = NarrativeText(text="A big brown bear")
    prediction = [
        label_studio.LabelStudioPrediction(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            score=0.82,
        )
    ]

    with pytest.raises(ValueError):
        label_studio.stage_for_label_studio([element], predictions=[prediction, prediction])


def test_stage_with_annotation_raises_with_invalid_type():
    with pytest.raises(ValueError):
        label_studio.LabelStudioResult(


@@ -1 +1 @@
-__version__ = "0.2.1-dev4" # pragma: no cover
+__version__ = "0.2.1-dev5" # pragma: no cover


@@ -98,9 +98,22 @@ class LabelStudioAnnotation:
        return _annotation_dict


@dataclass
class LabelStudioPrediction(LabelStudioAnnotation):
    score: float = 0

    def __post_init__(self):
        if not isinstance(self.score, (int, float)) or (self.score < 0 or self.score > 1):
            raise ValueError(
                f"{self.score} is not a valid score value. "
                f"Score value must be a number between 0 and 1."
            )


def stage_for_label_studio(
    elements: List[Text],
    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
    predictions: Optional[List[List[LabelStudioPrediction]]] = None,
    text_field: str = "text",
    id_field: str = "ref_id",
) -> LABEL_STUDIO_TYPE:
@@ -109,6 +122,9 @@ def stage_for_label_studio(
    if annotations is not None:
        if len(elements) != len(annotations):
            raise ValueError("The length of elements and annotations must match.")

    if predictions is not None:
        if len(elements) != len(predictions):
            raise ValueError("The length of elements and predictions must match.")

    label_studio_data: LABEL_STUDIO_TYPE = list()
    for i, element in enumerate(elements):
@@ -121,6 +137,8 @@ def stage_for_label_studio(
        labeling_example["data"] = data

        if annotations is not None:
            labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
        if predictions is not None:
            labeling_example["predictions"] = [a.to_dict() for a in predictions[i]]

        label_studio_data.append(labeling_example)

    return label_studio_data
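
Taken together, the new class and the updated staging function behave roughly as follows. This is a minimal sketch, not part of the commit, assuming the package layout shown in the diff above; the score lookup mirrors the structure asserted in test_stage_with_prediction.

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        LabelStudioPrediction,
        LabelStudioResult,
        stage_for_label_studio,
    )

    result = [
        LabelStudioResult(
            type="choices",
            value={"choices": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )
    ]

    # Scores outside [0, 1] are rejected by LabelStudioPrediction.__post_init__
    try:
        LabelStudioPrediction(result=result, score=1.25)
    except ValueError as exc:
        print(exc)  # "1.25 is not a valid score value. ..."

    # A valid score is staged under the "predictions" key for each element
    prediction = LabelStudioPrediction(result=result, score=0.75)
    staged = stage_for_label_studio(
        [NarrativeText(text="Narrative")], predictions=[[prediction]]
    )
    print(staged[0]["predictions"][0]["score"])  # 0.75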