mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-29 09:00:30 +00:00
feat: Optionally include LabelStudio annotations in staging brick (#19)
* added types for label studio annotations * added method to cast as dicts * added length check for annotations * tweaks to get upload to work * added validation for label types * annotations is a list for each example * little bit of refactoring * test for staging with label studio * tests for error conditions and reviewers * added test for NER annotations * updated changelog and bumped version * added docs with annotation examples * fix label studio link * bump version in sphinx docs * fulle -> full (typo fix)
This commit is contained in:
parent
29607c32ba
commit
a950559b94
@ -1,7 +1,8 @@
|
||||
## 0.2.1-dev2
|
||||
## 0.2.1-dev3
|
||||
|
||||
* Added staging brick for CSV format for Prodigy
|
||||
* Added staging brick for Prodigy
|
||||
* Added ability to upload LabelStudio annotations
|
||||
* Added text_field and id_field to stage_for_label_studio signature
|
||||
|
||||
## 0.2.0
|
||||
|
||||
@ -361,6 +361,98 @@ Examples:
|
||||
json.dump(label_studio_data, f, indent=4)
|
||||
|
||||
|
||||
You can also include pre-annotations as part of your LabelStudio upload. The
|
||||
``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
|
||||
annotations for each element in the ``elements`` list. If an element does not have any annotations,
|
||||
use an empty list.
|
||||
The following shows an example of how to upload annotations for the "Text Classification"
|
||||
task in LabelStudio:
|
||||
|
||||
.. code:: python
|
||||
|
||||
import json
|
||||
|
||||
from unstructured.documents.elements import NarrativeText
|
||||
from unstructured.staging.label_studio import (
|
||||
stage_for_label_studio,
|
||||
LabelStudioAnnotation,
|
||||
LabelStudioResult,
|
||||
)
|
||||
|
||||
|
||||
|
||||
elements = [NarrativeText(text="Narrative")]
|
||||
annotations = [[
|
||||
LabelStudioAnnotation(
|
||||
result=[
|
||||
LabelStudioResult(
|
||||
type="choices",
|
||||
value={"choices": ["Positive"]},
|
||||
from_name="sentiment",
|
||||
to_name="text",
|
||||
)
|
||||
]
|
||||
)
|
||||
]]
|
||||
label_studio_data = stage_for_label_studio(
|
||||
elements,
|
||||
annotations=annotations,
|
||||
text_field="my_text",
|
||||
id_field="my_id"
|
||||
)
|
||||
|
||||
# The resulting JSON file is ready to be uploaded to LabelStudio
|
||||
# with annotations included
|
||||
with open("label_studio.json", "w") as f:
|
||||
json.dump(label_studio_data, f, indent=4)
|
||||
|
||||
|
||||
The following shows an example of how to upload annotations for the "Named Entity Recognition"
|
||||
task in LabelStudio:
|
||||
|
||||
.. code:: python
|
||||
|
||||
import json
|
||||
|
||||
from unstructured.documents.elements import NarrativeText
|
||||
from unstructured.staging.label_studio import (
|
||||
stage_for_label_studio,
|
||||
LabelStudioAnnotation,
|
||||
LabelStudioResult,
|
||||
)
|
||||
|
||||
|
||||
|
||||
elements = [NarrativeText(text="Narrative")]
|
||||
annotations = [[
|
||||
LabelStudioAnnotation(
|
||||
result=[
|
||||
LabelStudioResult(
|
||||
type="labels",
|
||||
value={"start": 0, "end": 9, "text": "Narrative", "labels": ["MISC"]},
|
||||
from_name="label",
|
||||
to_name="text",
|
||||
)
|
||||
]
|
||||
)
|
||||
]]
|
||||
label_studio_data = stage_for_label_studio(
|
||||
elements,
|
||||
annotations=annotations,
|
||||
text_field="my_text",
|
||||
id_field="my_id"
|
||||
)
|
||||
|
||||
# The resulting JSON file is ready to be uploaded to LabelStudio
|
||||
# with annotations included
|
||||
with open("label_studio.json", "w") as f:
|
||||
json.dump(label_studio_data, f, indent=4)
|
||||
|
||||
|
||||
See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full list of options
|
||||
for labels and annotations.
|
||||
|
||||
|
||||
``stage_for_prodigy``
|
||||
--------------------------
|
||||
|
||||
|
||||
@ -22,7 +22,7 @@ copyright = '2022, Unstructured Technologies'
|
||||
author = 'Unstructured Technologies'
|
||||
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = '0.0.1'
|
||||
release = '0.2.1-dev3'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
@ -28,3 +28,175 @@ def test_specify_text_name(elements):
|
||||
def test_specify_id_name(elements):
    """Staging with a custom ``id_field`` keys the element ID under that name."""
    staged_examples = label_studio.stage_for_label_studio(elements, id_field="random_id")
    assert "random_id" in staged_examples[0]["data"]
|
||||
|
||||
|
||||
def test_created_annotation():
    """Serializing an annotation includes defaults and the nested result dicts."""
    annotation = label_studio.LabelStudioAnnotation(
        result=[
            label_studio.LabelStudioResult(
                type="choices",
                value={"choices": ["Positive"]},
                from_name="sentiment",
                to_name="text",
            )
        ]
    )

    # NOTE: the original comparison was missing `assert`, so its result was
    # silently discarded and the test could never fail.
    assert annotation.to_dict() == {
        "result": [
            {
                "type": "choices",
                "value": {"choices": ["Positive"]},
                "from_name": "sentiment",
                "id": None,
                "to_name": "text",
                "hidden": False,
                "read_only": False,
            }
        ],
        "was_canceled": False,
    }
|
||||
|
||||
|
||||
def test_stage_with_annotation():
    """A single "choices" annotation is nested under the staged example."""
    choices_result = label_studio.LabelStudioResult(
        type="choices",
        value={"choices": ["Positive"]},
        from_name="sentiment",
        to_name="text",
    )
    annotations = [label_studio.LabelStudioAnnotation(result=[choices_result])]
    element = NarrativeText(text="A big brown bear")

    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])

    expected = [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                }
            ],
        }
    ]
    assert label_studio_data == expected
|
||||
|
||||
|
||||
def test_stage_with_annotation_for_ner():
    """A span-based "labels" (NER) annotation is nested under the staged example."""
    ner_result = label_studio.LabelStudioResult(
        type="labels",
        value={"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
        from_name="label",
        to_name="text",
    )
    annotations = [label_studio.LabelStudioAnnotation(result=[ner_result])]
    element = NarrativeText(text="A big brown bear")

    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])

    expected = [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "labels",
                            "value": {"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
                            "from_name": "label",
                            "id": None,
                            "to_name": "text",
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "was_canceled": False,
                }
            ],
        }
    ]
    assert label_studio_data == expected
|
||||
|
||||
|
||||
def test_stage_with_annotation_raises_with_mismatched_lengths():
    """Two annotation lists supplied for a single element must be rejected."""
    choices_result = label_studio.LabelStudioResult(
        type="choices",
        value={"choices": ["Positive"]},
        from_name="sentiment",
        to_name="text",
    )
    annotations = [label_studio.LabelStudioAnnotation(result=[choices_result])]
    element = NarrativeText(text="A big brown bear")

    with pytest.raises(ValueError):
        label_studio.stage_for_label_studio([element], [annotations, annotations])
|
||||
|
||||
|
||||
def test_stage_with_annotation_raises_with_invalid_type():
    """A label type outside VALID_LABEL_TYPES is rejected at construction time."""
    with pytest.raises(ValueError):
        label_studio.LabelStudioResult(
            to_name="text",
            from_name="sentiment",
            type="bears",
            value={"bears": ["Positive"]},
        )
|
||||
|
||||
|
||||
def test_stage_with_reviewed_annotation():
    """An annotation carrying reviews serializes the reviews alongside the result."""
    choices_result = label_studio.LabelStudioResult(
        type="choices",
        value={"choices": ["Positive"]},
        from_name="sentiment",
        to_name="text",
    )
    review = label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)
    annotations = [
        label_studio.LabelStudioAnnotation(result=[choices_result], reviews=[review])
    ]
    element = NarrativeText(text="A big brown bear")

    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])

    expected = [
        {
            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
            "annotations": [
                {
                    "result": [
                        {
                            "type": "choices",
                            "value": {"choices": ["Positive"]},
                            "from_name": "sentiment",
                            "to_name": "text",
                            "id": None,
                            "hidden": False,
                            "read_only": False,
                        }
                    ],
                    "reviews": [{"created_by": {"user_id": 1}, "accepted": True, "id": None}],
                    "was_canceled": False,
                }
            ],
        }
    ]
    assert label_studio_data == expected
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.2.1-dev2" # pragma: no cover
|
||||
__version__ = "0.2.1-dev3" # pragma: no cover
|
||||
|
||||
@ -1,21 +1,126 @@
|
||||
from typing import Dict, List
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from unstructured.documents.elements import Text
|
||||
|
||||
|
||||
# Type alias for staged LabelStudio upload data: a list of task dicts.
LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]]

# NOTE(robinson) - ref: https://labelstud.io/tags/labels.html
# Label tag types accepted by LabelStudio; LabelStudioResult.__post_init__
# validates its `type` field against this list.
VALID_LABEL_TYPES = [
    "labels",
    "hypertextlabels",
    "paragraphlabels",
    "rectangle",
    "keypoint",
    "polygon",
    "brush",
    "ellipse",
    "rectanglelabels",
    "keypointlabels",
    "polygonlabels",
    "brushlabels",
    "ellipselabels",
    "timeserieslabels",
    "choices",
    "number",
    "taxonomy",
    "textarea",
    "rating",
    "pairwise",
    "videorectangle",
]
|
||||
|
||||
|
||||
@dataclass
class LabelStudioResult:
    """Class for representing a LabelStudio annotation result.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    type: str  # The type of tag used to annotate the task; must be in VALID_LABEL_TYPES
    value: Dict[str, Any]  # The annotation payload, e.g. {"choices": [...]} or span labels
    from_name: str  # Name of the source object tag (i.e. "sentiment" for the sentiment template)
    to_name: str  # Name of the destination control tag
    id: Optional[str] = None  # Optional result identifier; serialized as None when unset
    hidden: bool = False  # presumably hides the result in the LabelStudio UI -- TODO confirm
    read_only: bool = False  # presumably blocks edits in the LabelStudio UI -- TODO confirm

    def __post_init__(self):
        # Fail fast on label types LabelStudio would not accept
        if self.type not in VALID_LABEL_TYPES:
            raise ValueError(
                f"{self.type} is not a valid label type. "
                f"Valid label types are: {VALID_LABEL_TYPES}"
            )

    def to_dict(self):
        # NOTE(review): returns the live attribute dict, not a copy; callers that
        # mutate the returned dict mutate this instance.
        return self.__dict__
|
||||
|
||||
|
||||
@dataclass
class LabelStudioReview:
    """Class for representing a LabelStudio review. Reviews are only available in the
    Enterprise offering.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    created_by: Dict[str, Union[str, int]]  # Identifies the reviewer, e.g. {"user_id": 1}
    accepted: bool  # Whether the reviewer accepted the annotation
    id: Optional[str] = None  # Optional review identifier; serialized as None when unset

    def to_dict(self):
        # NOTE(review): returns the live attribute dict, not a copy; callers that
        # mutate the returned dict mutate this instance.
        return self.__dict__
|
||||
|
||||
|
||||
@dataclass
class LabelStudioAnnotation:
    """Class for representing LabelStudio annotations.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    result: List[LabelStudioResult]  # The result of the annotation
    id: Optional[str] = None
    lead_time: Optional[float] = None  # Time in seconds to label the task
    completed_by: Optional[int] = None  # User ID for the user who completed the task
    reviews: Optional[List[LabelStudioReview]] = None  # An array of the review results
    was_canceled: bool = False  # Indicates whether or not the annotation was canceled

    def to_dict(self):
        """Serialize the annotation (and its nested results/reviews) to a plain dict.

        Optional fields that were left as ``None`` are dropped from the output.
        """
        # Deep copy first so serialization never mutates the dataclass itself.
        annotation_dict = deepcopy(self.__dict__)
        annotation_dict["result"] = [r.to_dict() for r in annotation_dict["result"]]
        if annotation_dict.get("reviews") is not None:
            annotation_dict["reviews"] = [r.to_dict() for r in annotation_dict["reviews"]]

        # NOTE(robinson) - Removes keys for any fields that defaulted to None.
        # (The original implementation made a second deepcopy just to pop keys
        # while iterating; a filtered comprehension yields the same dict.)
        return {key: value for key, value in annotation_dict.items() if value is not None}
|
||||
|
||||
|
||||
def stage_for_label_studio(
    elements: List[Text],
    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
    text_field: str = "text",
    id_field: str = "ref_id",
) -> LABEL_STUDIO_TYPE:
    """Converts the document to the format required for upload to LabelStudio.
    ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
    # When annotations are supplied, there must be one (possibly empty) list
    # of annotations per element.
    if annotations is not None and len(elements) != len(annotations):
        raise ValueError("The length of elements and annotations must match.")

    label_studio_data: LABEL_STUDIO_TYPE = []
    for idx, element in enumerate(elements):
        data: Dict[str, str] = {text_field: element.text}
        if isinstance(element.id, str):
            data[id_field] = element.id

        labeling_example: Dict[str, Any] = {"data": data}
        if annotations is not None:
            labeling_example["annotations"] = [a.to_dict() for a in annotations[idx]]
        label_studio_data.append(labeling_example)

    return label_studio_data
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user