feat: Optionally include LabelStudio annotations in staging brick (#19)

* added types for label studio annotations

* added method to cast as dicts

* added length check for annotations

* tweaks to get upload to work

* added validation for label types

* annotations is a list for each example

* little bit of refactoring

* test for staging with label studio

* tests for error conditions and reviewers

* added test for NER annotations

* updated changelog and bumped version

* added docs with annotation examples

* fix label studio link

* bump version in sphinx docs

* fulle -> full (typo fix)
Matt Robinson 2022-10-04 09:25:05 -04:00 committed by GitHub
parent 29607c32ba
commit a950559b94
6 changed files with 377 additions and 7 deletions


@@ -1,7 +1,8 @@
## 0.2.1-dev2
## 0.2.1-dev3
* Added staging brick for CSV format for Prodigy
* Added staging brick for Prodigy
* Added ability to upload LabelStudio annotations
* Added text_field and id_field to stage_for_label_studio signature
## 0.2.0


@@ -361,6 +361,98 @@ Examples:

        json.dump(label_studio_data, f, indent=4)

You can also include pre-annotations as part of your LabelStudio upload. The
``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
annotations for each element in the ``elements`` list. If an element does not have any annotations,
use an empty list, as shown in the sketch between the two examples below.

The following shows an example of how to upload annotations for the "Text Classification"
task in LabelStudio:

.. code:: python

    import json

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    elements = [NarrativeText(text="Narrative")]

    annotations = [[
        LabelStudioAnnotation(
            result=[
                LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ]
        )
    ]]

    label_studio_data = stage_for_label_studio(
        elements,
        annotations=annotations,
        text_field="my_text",
        id_field="my_id",
    )

    # The resulting JSON file is ready to be uploaded to LabelStudio
    # with annotations included
    with open("label_studio.json", "w") as f:
        json.dump(label_studio_data, f, indent=4)

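If only some elements are annotated, the inner lists must still line up with ``elements``
one-to-one, with an empty list for each unannotated element. The following is a minimal
editorial sketch of that case (the second element and its text are illustrative, not part
of the original example):

.. code:: python

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    elements = [
        NarrativeText(text="Narrative"),
        NarrativeText(text="A second, unannotated narrative"),  # illustrative element
    ]

    # One inner list per element; the second entry is an empty list because
    # the second element has no pre-annotations
    annotations = [
        [
            LabelStudioAnnotation(
                result=[
                    LabelStudioResult(
                        type="choices",
                        value={"choices": ["Positive"]},
                        from_name="sentiment",
                        to_name="text",
                    )
                ]
            )
        ],
        [],
    ]

    label_studio_data = stage_for_label_studio(elements, annotations=annotations)
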
The following shows an example of how to upload annotations for the "Named Entity Recognition"
task in LabelStudio:

.. code:: python

    import json

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    elements = [NarrativeText(text="Narrative")]

    annotations = [[
        LabelStudioAnnotation(
            result=[
                LabelStudioResult(
                    type="labels",
                    value={"start": 0, "end": 9, "text": "Narrative", "labels": ["MISC"]},
                    from_name="label",
                    to_name="text",
                )
            ]
        )
    ]]

    label_studio_data = stage_for_label_studio(
        elements,
        annotations=annotations,
        text_field="my_text",
        id_field="my_id",
    )

    # The resulting JSON file is ready to be uploaded to LabelStudio
    # with annotations included
    with open("label_studio.json", "w") as f:
        json.dump(label_studio_data, f, indent=4)

See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full list of options
for labels and annotations.
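Note that the ``type`` of each result is validated against that list of tag types, and an
unrecognized type raises a ``ValueError``. A minimal sketch of the failure mode, mirroring
the validation introduced in this commit:

.. code:: python

    from unstructured.staging.label_studio import LabelStudioResult

    # Raises a ValueError because "bears" is not a valid LabelStudio tag type
    LabelStudioResult(
        type="bears",
        value={"bears": ["Positive"]},
        from_name="sentiment",
        to_name="text",
    )
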
``stage_for_prodigy``
--------------------------


@@ -22,7 +22,7 @@ copyright = '2022, Unstructured Technologies'
author = 'Unstructured Technologies'
# The full version, including alpha/beta/rc tags
release = '0.0.1'
release = '0.2.1-dev3'
# -- General configuration ---------------------------------------------------


@@ -28,3 +28,175 @@ def test_specify_text_name(elements):

def test_specify_id_name(elements):
    label_studio_data = label_studio.stage_for_label_studio(elements, id_field="random_id")
    assert "random_id" in label_studio_data[0]["data"]


def test_created_annotation():
    annotation = label_studio.LabelStudioAnnotation(
        result=[
            label_studio.LabelStudioResult(
                type="choices",
                value={"choices": ["Positive"]},
                from_name="sentiment",
                to_name="text",
            )
        ]
    )
    # NOTE: the original test was missing the assert, making the comparison a no-op
    assert annotation.to_dict() == {
        "result": [
            {
                "type": "choices",
                "value": {"choices": ["Positive"]},
                "from_name": "sentiment",
                "id": None,
                "to_name": "text",
                "hidden": False,
                "read_only": False,
            }
        ],
        "was_canceled": False,
    }


def test_stage_with_annotation():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ]
        )
    ]
    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                }
            ],
        }
    ]


def test_stage_with_annotation_for_ner():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="labels",
                    value={"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
                    from_name="label",
                    to_name="text",
                )
            ]
        )
    ]
    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "labels",
                            "value": {"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
                            "from_name": "label",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                }
            ],
        }
    ]


def test_stage_with_annotation_raises_with_mismatched_lengths():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ]
        )
    ]
    with pytest.raises(ValueError):
        label_studio.stage_for_label_studio([element], [annotations, annotations])


def test_stage_with_annotation_raises_with_invalid_type():
    with pytest.raises(ValueError):
        label_studio.LabelStudioResult(
            type="bears",
            value={"bears": ["Positive"]},
            from_name="sentiment",
            to_name="text",
        )


def test_stage_with_reviewed_annotation():
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            reviews=[label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)],
        )
    ]
    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
    assert label_studio_data == [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "to_name": "text",
                            "id": None,
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "reviews": [{"created_by": {"user_id": 1}, "accepted": True, "id": None}],
                    "was_canceled": False,
                }
            ],
        }
    ]


@@ -1 +1 @@
__version__ = "0.2.1-dev2" # pragma: no cover
__version__ = "0.2.1-dev3" # pragma: no cover


@@ -1,21 +1,126 @@
from typing import Dict, List
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

from unstructured.documents.elements import Text

LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]]

# NOTE(robinson) - ref: https://labelstud.io/tags/labels.html
VALID_LABEL_TYPES = [
    "labels",
    "hypertextlabels",
    "paragraphlabels",
    "rectangle",
    "keypoint",
    "polygon",
    "brush",
    "ellipse",
    "rectanglelabels",
    "keypointlabels",
    "polygonlabels",
    "brushlabels",
    "ellipselabels",
    "timeserieslabels",
    "choices",
    "number",
    "taxonomy",
    "textarea",
    "rating",
    "pairwise",
    "videorectangle",
]

@dataclass
class LabelStudioResult:
    """Class for representing a LabelStudio annotation result.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    type: str  # The type of tag used to annotate the task
    value: Dict[str, Any]  # The values for the annotation result
    from_name: str  # Name of the control tag used to label (e.g. "sentiment" for the sentiment template)
    to_name: str  # Name of the object tag that was labeled (e.g. "text")
    id: Optional[str] = None
    hidden: bool = False
    read_only: bool = False

    def __post_init__(self):
        if self.type not in VALID_LABEL_TYPES:
            raise ValueError(
                f"{self.type} is not a valid label type. "
                f"Valid label types are: {VALID_LABEL_TYPES}"
            )

    def to_dict(self):
        return self.__dict__

@dataclass
class LabelStudioReview:
    """Class for representing a LabelStudio review. Reviews are only available in the
    Enterprise offering.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    created_by: Dict[str, Union[str, int]]
    accepted: bool
    id: Optional[str] = None

    def to_dict(self):
        return self.__dict__

@dataclass
class LabelStudioAnnotation:
    """Class for representing LabelStudio annotations.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    result: List[LabelStudioResult]  # The result of the annotation
    id: Optional[str] = None
    lead_time: Optional[float] = None  # Time in seconds to label the task
    completed_by: Optional[int] = None  # User ID for the user who completed the task
    reviews: Optional[List[LabelStudioReview]] = None  # An array of the review results
    was_canceled: bool = False  # Indicates whether or not the annotation was canceled

    def to_dict(self):
        annotation_dict = deepcopy(self.__dict__)
        annotation_dict["result"] = [r.to_dict() for r in annotation_dict["result"]]
        if "reviews" in annotation_dict and annotation_dict["reviews"] is not None:
            annotation_dict["reviews"] = [r.to_dict() for r in annotation_dict["reviews"]]
        # NOTE(robinson) - Removes keys for any fields that defaulted to None
        _annotation_dict = deepcopy(annotation_dict)
        for key, value in annotation_dict.items():
            if value is None:
                _annotation_dict.pop(key)
        return _annotation_dict

def stage_for_label_studio(
    elements: List[Text], text_field: str = "text", id_field: str = "ref_id"
    elements: List[Text],
    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
    text_field: str = "text",
    id_field: str = "ref_id",
) -> LABEL_STUDIO_TYPE:
    """Converts the document to the format required for upload to LabelStudio.
    ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
    if annotations is not None:
        if len(elements) != len(annotations):
            raise ValueError("The length of elements and annotations must match.")

    label_studio_data: LABEL_STUDIO_TYPE = list()
    for element in elements:
    for i, element in enumerate(elements):
        data: Dict[str, str] = dict()
        data[text_field] = element.text
        if isinstance(element.id, str):
            data[id_field] = element.id
        label_studio_data.append({"data": data})
        labeling_example: Dict[str, Any] = dict()
        labeling_example["data"] = data
        if annotations is not None:
            labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
        label_studio_data.append(labeling_example)

    return label_studio_data