Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-12-24 05:34:58 +00:00)
feat: Allow option to specify predictions in LabelStudio staging brick (#23)
* Allow stage_for_label_studio to take a predictions input and implement prediction class
* Update unit tests for LabelStudioPrediction and stage_for_label_studio function
* Update stage_for_label_studio docs with example of loading predictions
* Bump version and update changelog

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent 779e48bafe
commit baba641d03
@@ -1,5 +1,6 @@
-## 0.2.1-dev4
+## 0.2.1-dev5
 
+* Added ability to upload LabelStudio predictions
 * Added utility function for JSONL reading and writing
 * Added staging brick for CSV format for Prodigy
 * Added staging brick for Prodigy
@@ -361,8 +361,9 @@ Examples:
         json.dump(label_studio_data, f, indent=4)
 
 
-You can also include pre-annotations as part of your LabelStudio upload. The
-``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
+You can also include pre-annotations and predictions as part of your LabelStudio upload.
+
+The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
 annotations for each element in the ``elements`` list. If an element does not have any annotations,
 use an empty list.
 The following shows an example of how to upload annotations for the "Text Classification"
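For illustration, a minimal sketch (not part of this diff) of the list-of-lists shape the ``annotations`` kwarg expects, with an empty list standing in for an element that has no annotations. It assumes ``LabelStudioAnnotation`` can be constructed from a ``result`` list alone, as the prediction examples later in this commit suggest:

.. code:: python

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    sentiment_result = [
        LabelStudioResult(
            type="choices",
            value={"choices": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )
    ]

    elements = [NarrativeText(text="Labeled text"), NarrativeText(text="Unlabeled text")]
    annotations = [
        [LabelStudioAnnotation(result=sentiment_result)],  # annotations for the first element
        [],  # the second element has no annotations, so it gets an empty list
    ]

    label_studio_data = stage_for_label_studio(elements, annotations=annotations)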
@@ -407,6 +408,52 @@ task in LabelStudio:
         json.dump(label_studio_data, f, indent=4)
 
 
+Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with
+the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of
+predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list.
+The following shows an example of how to upload predictions for the "Text Classification"
+task in LabelStudio:
+
+.. code:: python
+
+    import json
+
+    from unstructured.documents.elements import NarrativeText
+    from unstructured.staging.label_studio import (
+        stage_for_label_studio,
+        LabelStudioPrediction,
+        LabelStudioResult,
+    )
+
+    elements = [NarrativeText(text="Narrative")]
+    predictions = [[
+        LabelStudioPrediction(
+            result=[
+                LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ],
+            score=0.68,
+        )
+    ]]
+    label_studio_data = stage_for_label_studio(
+        elements,
+        predictions=predictions,
+        text_field="my_text",
+        id_field="my_id",
+    )
+
+    # The resulting JSON file is ready to be uploaded to LabelStudio
+    # with predictions included
+    with open("label_studio.json", "w") as f:
+        json.dump(label_studio_data, f, indent=4)
+
+
 The following shows an example of how to upload annotations for the "Named Entity Recognition"
 task in LabelStudio:
@@ -115,6 +115,49 @@ def test_created_annotation():
     }
 
 
+@pytest.mark.parametrize(
+    "score, raises, exception",
+    [
+        (None, True, ValueError),
+        (-0.25, True, ValueError),
+        (0, False, None),
+        (0.5, False, None),
+        (1, False, None),
+        (1.25, True, ValueError),
+    ],
+)
+def test_init_prediction(score, raises, exception):
+    result = [
+        label_studio.LabelStudioResult(
+            type="choices",
+            value={"choices": ["Positive"]},
+            from_name="sentiment",
+            to_name="text",
+        )
+    ]
+
+    if raises:
+        with pytest.raises(exception):
+            label_studio.LabelStudioPrediction(result=result, score=score)
+    else:
+        prediction = label_studio.LabelStudioPrediction(result=result, score=score)
+        assert prediction.to_dict() == {
+            "result": [
+                {
+                    "type": "choices",
+                    "value": {"choices": ["Positive"]},
+                    "from_name": "sentiment",
+                    "id": None,
+                    "to_name": "text",
+                    "hidden": False,
+                    "read_only": False,
+                }
+            ],
+            "was_canceled": False,
+            "score": score,
+        }
+
+
 def test_stage_with_annotation():
     element = NarrativeText(text="A big brown bear")
     annotations = [
@@ -153,6 +196,46 @@ def test_stage_with_annotation():
     ]
 
 
+def test_stage_with_prediction():
+    element = NarrativeText(text="A big brown bear")
+    prediction = [
+        label_studio.LabelStudioPrediction(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ],
+            score=0.98,
+        )
+    ]
+    label_studio_data = label_studio.stage_for_label_studio([element], predictions=[prediction])
+    assert label_studio_data == [
+        {
+            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
+            "predictions": [
+                {
+                    "result": [
+                        {
+                            "type": "choices",
+                            "value": {"choices": ["Positive"]},
+                            "from_name": "sentiment",
+                            "id": None,
+                            "to_name": "text",
+                            "hidden": False,
+                            "read_only": False,
+                        }
+                    ],
+                    "was_canceled": False,
+                    "score": 0.98,
+                }
+            ],
+        }
+    ]
+
+
 def test_stage_with_annotation_for_ner():
     element = NarrativeText(text="A big brown bear")
     annotations = [
@@ -209,6 +292,25 @@ def test_stage_with_annotation_raises_with_mismatched_lengths():
         label_studio.stage_for_label_studio([element], [annotations, annotations])
 
 
+def test_stage_with_prediction_raises_with_mismatched_lengths():
+    element = NarrativeText(text="A big brown bear")
+    prediction = [
+        label_studio.LabelStudioPrediction(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ],
+            score=0.82,
+        )
+    ]
+    with pytest.raises(ValueError):
+        label_studio.stage_for_label_studio([element], predictions=[prediction, prediction])
+
+
 def test_stage_with_annotation_raises_with_invalid_type():
     with pytest.raises(ValueError):
         label_studio.LabelStudioResult(
@@ -1 +1 @@
-__version__ = "0.2.1-dev4"  # pragma: no cover
+__version__ = "0.2.1-dev5"  # pragma: no cover
@@ -98,9 +98,22 @@ class LabelStudioAnnotation:
         return _annotation_dict
 
 
+@dataclass
+class LabelStudioPrediction(LabelStudioAnnotation):
+    score: float = 0
+
+    def __post_init__(self):
+        if not isinstance(self.score, (int, float)) or (self.score < 0 or self.score > 1):
+            raise ValueError(
+                f"{self.score} is not a valid score value. "
+                "Score value must be a number between 0 and 1."
+            )
+
+
 def stage_for_label_studio(
     elements: List[Text],
     annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
+    predictions: Optional[List[List[LabelStudioPrediction]]] = None,
     text_field: str = "text",
     id_field: str = "ref_id",
 ) -> LABEL_STUDIO_TYPE:
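A brief illustrative sketch (not taken from the diff) of how the ``score`` check in ``__post_init__`` behaves; it assumes the class is imported from ``unstructured.staging.label_studio`` as in the documentation example above:

.. code:: python

    from unstructured.staging.label_studio import LabelStudioPrediction, LabelStudioResult

    result = [
        LabelStudioResult(
            type="choices",
            value={"choices": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )
    ]

    # Scores inside [0, 1] pass the __post_init__ check.
    LabelStudioPrediction(result=result, score=0.75)

    # A score outside [0, 1] (or a non-numeric value such as None) raises ValueError.
    try:
        LabelStudioPrediction(result=result, score=1.5)
    except ValueError as error:
        print(error)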
@@ -109,6 +122,9 @@ def stage_for_label_studio(
     if annotations is not None:
         if len(elements) != len(annotations):
             raise ValueError("The length of elements and annotations must match.")
+    if predictions is not None:
+        if len(elements) != len(predictions):
+            raise ValueError("The length of elements and predictions must match.")
 
     label_studio_data: LABEL_STUDIO_TYPE = list()
     for i, element in enumerate(elements):
@@ -121,6 +137,8 @@ def stage_for_label_studio(
         labeling_example["data"] = data
         if annotations is not None:
            labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
+        if predictions is not None:
+            labeling_example["predictions"] = [a.to_dict() for a in predictions[i]]
         label_studio_data.append(labeling_example)
 
     return label_studio_data
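Because the two kwargs are handled independently in the loop above, annotations and predictions can be staged together in a single call. A small sketch under the same import assumptions as the documentation example:

.. code:: python

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioPrediction,
        LabelStudioResult,
    )

    result = [
        LabelStudioResult(
            type="choices",
            value={"choices": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )
    ]
    elements = [NarrativeText(text="A big brown bear")]

    label_studio_data = stage_for_label_studio(
        elements,
        annotations=[[LabelStudioAnnotation(result=result)]],
        predictions=[[LabelStudioPrediction(result=result, score=0.9)]],
    )

    # Each staged example now carries both an "annotations" and a "predictions" key.
    print(label_studio_data[0].keys())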