mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-04 12:03:15 +00:00
feat: Optionally include LabelStudio annotations in staging brick (#19)
* added types for label studio annotations * added method to cast as dicts * added length check for annotations * tweaks to get upload to work * added validation for label types * annotations is a list for each example * little bit of refactoring * test for staging with label studio * tests for error conditions and reviewers * added test for NER annotations * updated changelog and bumped version * added docs with annotation examples * fix label studio link * bump version in sphinx docs * fulle -> full (typo fix)
This commit is contained in:
parent
29607c32ba
commit
a950559b94
@ -1,7 +1,8 @@
|
|||||||
## 0.2.1-dev2
|
## 0.2.1-dev3
|
||||||
|
|
||||||
* Added staging brick for CSV format for Prodigy
|
* Added staging brick for CSV format for Prodigy
|
||||||
* Added staging brick for Prodigy
|
* Added staging brick for Prodigy
|
||||||
|
* Added ability to upload LabelStudio annotations
|
||||||
* Added text_field and id_field to stage_for_label_studio signature
|
* Added text_field and id_field to stage_for_label_studio signature
|
||||||
|
|
||||||
## 0.2.0
|
## 0.2.0
|
||||||
|
|||||||
@ -361,6 +361,98 @@ Examples:
|
|||||||
json.dump(label_studio_data, f, indent=4)
|
json.dump(label_studio_data, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
You can also include pre-annotations as part of your LabelStudio upload. The
|
||||||
|
``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
|
||||||
|
annotations for each element in the ``elements`` list. If an element does not have any annotations,
|
||||||
|
use an empty list.
|
||||||
|
The following shows an example of how to upload annotations for the "Text Classification"
|
||||||
|
task in LabelStudio:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from unstructured.documents.elements import NarrativeText
|
||||||
|
from unstructured.staging.label_studio import (
|
||||||
|
stage_for_label_studio,
|
||||||
|
LabelStudioAnnotation,
|
||||||
|
LabelStudioResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
elements = [NarrativeText(text="Narrative")]
|
||||||
|
annotations = [[
|
||||||
|
LabelStudioAnnotation(
|
||||||
|
result=[
|
||||||
|
LabelStudioResult(
|
||||||
|
type="choices",
|
||||||
|
value={"choices": ["Positive"]},
|
||||||
|
from_name="sentiment",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]]
|
||||||
|
label_studio_data = stage_for_label_studio(
|
||||||
|
elements,
|
||||||
|
annotations=annotations,
|
||||||
|
text_field="my_text",
|
||||||
|
id_field="my_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
# The resulting JSON file is ready to be uploaded to LabelStudio
|
||||||
|
# with annotations included
|
||||||
|
with open("label_studio.json", "w") as f:
|
||||||
|
json.dump(label_studio_data, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
The following shows an example of how to upload annotations for the "Named Entity Recognition"
|
||||||
|
task in LabelStudio:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from unstructured.documents.elements import NarrativeText
|
||||||
|
from unstructured.staging.label_studio import (
|
||||||
|
stage_for_label_studio,
|
||||||
|
LabelStudioAnnotation,
|
||||||
|
LabelStudioResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
elements = [NarrativeText(text="Narrative")]
|
||||||
|
annotations = [[
|
||||||
|
LabelStudioAnnotation(
|
||||||
|
result=[
|
||||||
|
LabelStudioResult(
|
||||||
|
type="labels",
|
||||||
|
value={"start": 0, "end": 9, "text": "Narrative", "labels": ["MISC"]},
|
||||||
|
from_name="label",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]]
|
||||||
|
label_studio_data = stage_for_label_studio(
|
||||||
|
elements,
|
||||||
|
annotations=annotations,
|
||||||
|
text_field="my_text",
|
||||||
|
id_field="my_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
# The resulting JSON file is ready to be uploaded to LabelStudio
|
||||||
|
# with annotations included
|
||||||
|
with open("label_studio.json", "w") as f:
|
||||||
|
json.dump(label_studio_data, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full list of options
|
||||||
|
for labels and annotations.
|
||||||
|
|
||||||
|
|
||||||
``stage_for_prodigy``
|
``stage_for_prodigy``
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@ copyright = '2022, Unstructured Technologies'
|
|||||||
author = 'Unstructured Technologies'
|
author = 'Unstructured Technologies'
|
||||||
|
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = '0.0.1'
|
release = '0.2.1-dev3'
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|||||||
@ -28,3 +28,175 @@ def test_specify_text_name(elements):
|
|||||||
def test_specify_id_name(elements):
|
def test_specify_id_name(elements):
|
||||||
label_studio_data = label_studio.stage_for_label_studio(elements, id_field="random_id")
|
label_studio_data = label_studio.stage_for_label_studio(elements, id_field="random_id")
|
||||||
assert "random_id" in label_studio_data[0]["data"]
|
assert "random_id" in label_studio_data[0]["data"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_created_annotation():
|
||||||
|
annotation = label_studio.LabelStudioAnnotation(
|
||||||
|
result=[
|
||||||
|
label_studio.LabelStudioResult(
|
||||||
|
type="choices",
|
||||||
|
value={"choices": ["Positive"]},
|
||||||
|
from_name="sentiment",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
annotation.to_dict() == {
|
||||||
|
"result": [
|
||||||
|
{
|
||||||
|
"type": "choices",
|
||||||
|
"value": {"choices": ["Positive"]},
|
||||||
|
"from_name": "sentiment",
|
||||||
|
"id": None,
|
||||||
|
"to_name": "text",
|
||||||
|
"hidden": False,
|
||||||
|
"read_only": False,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"was_canceled": False,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_stage_with_annotation():
|
||||||
|
element = NarrativeText(text="A big brown bear")
|
||||||
|
annotations = [
|
||||||
|
label_studio.LabelStudioAnnotation(
|
||||||
|
result=[
|
||||||
|
label_studio.LabelStudioResult(
|
||||||
|
type="choices",
|
||||||
|
value={"choices": ["Positive"]},
|
||||||
|
from_name="sentiment",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
|
||||||
|
assert label_studio_data == [
|
||||||
|
{
|
||||||
|
"data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
|
||||||
|
"annotations": [
|
||||||
|
{
|
||||||
|
"result": [
|
||||||
|
{
|
||||||
|
"type": "choices",
|
||||||
|
"value": {"choices": ["Positive"]},
|
||||||
|
"from_name": "sentiment",
|
||||||
|
"id": None,
|
||||||
|
"to_name": "text",
|
||||||
|
"hidden": False,
|
||||||
|
"read_only": False,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"was_canceled": False,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_stage_with_annotation_for_ner():
|
||||||
|
element = NarrativeText(text="A big brown bear")
|
||||||
|
annotations = [
|
||||||
|
label_studio.LabelStudioAnnotation(
|
||||||
|
result=[
|
||||||
|
label_studio.LabelStudioResult(
|
||||||
|
type="labels",
|
||||||
|
value={"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
|
||||||
|
from_name="label",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
|
||||||
|
assert label_studio_data == [
|
||||||
|
{
|
||||||
|
"data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
|
||||||
|
"annotations": [
|
||||||
|
{
|
||||||
|
"result": [
|
||||||
|
{
|
||||||
|
"type": "labels",
|
||||||
|
"value": {"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
|
||||||
|
"from_name": "label",
|
||||||
|
"id": None,
|
||||||
|
"to_name": "text",
|
||||||
|
"hidden": False,
|
||||||
|
"read_only": False,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"was_canceled": False,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_stage_with_annotation_raises_with_mismatched_lengths():
|
||||||
|
element = NarrativeText(text="A big brown bear")
|
||||||
|
annotations = [
|
||||||
|
label_studio.LabelStudioAnnotation(
|
||||||
|
result=[
|
||||||
|
label_studio.LabelStudioResult(
|
||||||
|
type="choices",
|
||||||
|
value={"choices": ["Positive"]},
|
||||||
|
from_name="sentiment",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
label_studio.stage_for_label_studio([element], [annotations, annotations])
|
||||||
|
|
||||||
|
|
||||||
|
def test_stage_with_annotation_raises_with_invalid_type():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
label_studio.LabelStudioResult(
|
||||||
|
type="bears",
|
||||||
|
value={"bears": ["Positive"]},
|
||||||
|
from_name="sentiment",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stage_with_reviewed_annotation():
|
||||||
|
element = NarrativeText(text="A big brown bear")
|
||||||
|
annotations = [
|
||||||
|
label_studio.LabelStudioAnnotation(
|
||||||
|
result=[
|
||||||
|
label_studio.LabelStudioResult(
|
||||||
|
type="choices",
|
||||||
|
value={"choices": ["Positive"]},
|
||||||
|
from_name="sentiment",
|
||||||
|
to_name="text",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
reviews=[label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
|
||||||
|
assert label_studio_data == [
|
||||||
|
{
|
||||||
|
"data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
|
||||||
|
"annotations": [
|
||||||
|
{
|
||||||
|
"result": [
|
||||||
|
{
|
||||||
|
"type": "choices",
|
||||||
|
"value": {"choices": ["Positive"]},
|
||||||
|
"from_name": "sentiment",
|
||||||
|
"to_name": "text",
|
||||||
|
"id": None,
|
||||||
|
"hidden": False,
|
||||||
|
"read_only": False,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"reviews": [{"created_by": {"user_id": 1}, "accepted": True, "id": None}],
|
||||||
|
"was_canceled": False,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.2.1-dev2" # pragma: no cover
|
__version__ = "0.2.1-dev3" # pragma: no cover
|
||||||
|
|||||||
@ -1,21 +1,126 @@
|
|||||||
from typing import Dict, List
|
from copy import deepcopy
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
from unstructured.documents.elements import Text
|
from unstructured.documents.elements import Text
|
||||||
|
|
||||||
|
|
||||||
LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]]
|
LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]]
|
||||||
|
|
||||||
|
# NOTE(robinson) - ref: https://labelstud.io/tags/labels.html
|
||||||
|
VALID_LABEL_TYPES = [
|
||||||
|
"labels",
|
||||||
|
"hypertextlabels",
|
||||||
|
"paragraphlabels",
|
||||||
|
"rectangle",
|
||||||
|
"keypoint",
|
||||||
|
"polygon",
|
||||||
|
"brush",
|
||||||
|
"ellipse",
|
||||||
|
"rectanglelabels",
|
||||||
|
"keypointlabels",
|
||||||
|
"polygonlabels",
|
||||||
|
"brushlabels",
|
||||||
|
"ellipselabels",
|
||||||
|
"timeserieslabels",
|
||||||
|
"choices",
|
||||||
|
"number",
|
||||||
|
"taxonomy",
|
||||||
|
"textarea",
|
||||||
|
"rating",
|
||||||
|
"pairwise",
|
||||||
|
"videorectangle",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class LabelStudioResult:
|
||||||
|
"""Class for representing a LabelStudio annotation result.
|
||||||
|
ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
|
||||||
|
|
||||||
|
type: str # The type of tag used to annotate the task
|
||||||
|
value: Dict[str, Any] # The values for the result; the expected keys depend on the label type
|
||||||
|
from_name: str # Name of the control tag used to create the label (e.g. "sentiment" for the sentiment template)
|
||||||
|
to_name: str # Name of the object tag that holds the data being labeled (e.g. "text")
|
||||||
|
id: Optional[str] = None
|
||||||
|
hidden: bool = False
|
||||||
|
read_only: bool = False
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if self.type not in VALID_LABEL_TYPES:
|
||||||
|
raise ValueError(
|
||||||
|
f"{self.type} is not a valid label type. "
|
||||||
|
f"Valid label types are: {VALID_LABEL_TYPES}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
return self.__dict__
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class LabelStudioReview:
|
||||||
|
"""Class for representing a LabelStudio review. Reviews are only available in the
|
||||||
|
Enterprise offering.
|
||||||
|
ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
|
||||||
|
|
||||||
|
created_by: Dict[str, Union[str, int]]
|
||||||
|
accepted: bool
|
||||||
|
id: Optional[str] = None
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
return self.__dict__
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class LabelStudioAnnotation:
|
||||||
|
"""Class for representing LabelStudio annotations.
|
||||||
|
ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
|
||||||
|
|
||||||
|
result: List[LabelStudioResult] # The result of the annotation
|
||||||
|
id: Optional[str] = None
|
||||||
|
lead_time: Optional[float] = None # Time in seconds to label the task
|
||||||
|
completed_by: Optional[int] = None # User ID for the user who completed the task
|
||||||
|
reviews: Optional[List[LabelStudioReview]] = None # An array of the review results
|
||||||
|
was_canceled: bool = False # Indicates whether or not the annotation was canceled
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
annotation_dict = deepcopy(self.__dict__)
|
||||||
|
annotation_dict["result"] = [r.to_dict() for r in annotation_dict["result"]]
|
||||||
|
if "reviews" in annotation_dict and annotation_dict["reviews"] is not None:
|
||||||
|
annotation_dict["reviews"] = [r.to_dict() for r in annotation_dict["reviews"]]
|
||||||
|
|
||||||
|
# NOTE(robinson) - Removes keys for any fields that defaulted to None
|
||||||
|
_annotation_dict = deepcopy(annotation_dict)
|
||||||
|
for key, value in annotation_dict.items():
|
||||||
|
if value is None:
|
||||||
|
_annotation_dict.pop(key)
|
||||||
|
|
||||||
|
return _annotation_dict
|
||||||
|
|
||||||
|
|
||||||
def stage_for_label_studio(
|
def stage_for_label_studio(
|
||||||
elements: List[Text], text_field: str = "text", id_field: str = "ref_id"
|
elements: List[Text],
|
||||||
|
annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
|
||||||
|
text_field: str = "text",
|
||||||
|
id_field: str = "ref_id",
|
||||||
) -> LABEL_STUDIO_TYPE:
|
) -> LABEL_STUDIO_TYPE:
|
||||||
"""Converts the document to the format required for upload to LabelStudio.
|
"""Converts the document to the format required for upload to LabelStudio.
|
||||||
ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
|
ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
|
||||||
|
if annotations is not None:
|
||||||
|
if len(elements) != len(annotations):
|
||||||
|
raise ValueError("The length of elements and annotations must match.")
|
||||||
|
|
||||||
label_studio_data: LABEL_STUDIO_TYPE = list()
|
label_studio_data: LABEL_STUDIO_TYPE = list()
|
||||||
for element in elements:
|
for i, element in enumerate(elements):
|
||||||
data: Dict[str, str] = dict()
|
data: Dict[str, str] = dict()
|
||||||
data[text_field] = element.text
|
data[text_field] = element.text
|
||||||
if isinstance(element.id, str):
|
if isinstance(element.id, str):
|
||||||
data[id_field] = element.id
|
data[id_field] = element.id
|
||||||
label_studio_data.append({"data": data})
|
|
||||||
|
labeling_example: Dict[str, Any] = dict()
|
||||||
|
labeling_example["data"] = data
|
||||||
|
if annotations is not None:
|
||||||
|
labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
|
||||||
|
label_studio_data.append(labeling_example)
|
||||||
|
|
||||||
return label_studio_data
|
return label_studio_data
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user