From a950559b94511f7237e9ef48a7aaa1e98bfb399b Mon Sep 17 00:00:00 2001
From: Matt Robinson
Date: Tue, 4 Oct 2022 09:25:05 -0400
Subject: [PATCH] feat: Optionally include LabelStudio annotations in staging brick (#19)

* added types for label studio annotations
* added method to cast as dicts
* added length check for annotations
* tweaks to get upload to work
* added validation for label types
* annotations is a list for each example
* little bit of refactoring
* test for staging with label studio
* tests for error conditions and reviewers
* added test for NER annotations
* updated changelog and bumped version
* added docs with annotation examples
* fix label studio link
* bump version in sphinx docs
* fulle -> full (typo fix)
---
 CHANGELOG.md                          |   3 +-
 docs/source/bricks.rst                |  92 ++++++++++
 docs/source/conf.py                   |   2 +-
 .../staging/test_label_studio.py      | 172 ++++++++++++++++++
 unstructured/__version__.py           |   2 +-
 unstructured/staging/label_studio.py  | 113 +++++++++++-
 6 files changed, 377 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7fa5c51c5..ea0ecc4c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.2.1-dev2
+## 0.2.1-dev3
 
 * Added staging brick for CSV format for Prodigy
 * Added staging brick for Prodigy
+* Added ability to upload LabelStudio annotations
 * Added text_field and id_field to stage_for_label_studio signature
 
 ## 0.2.0
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index fe2477299..a4678d7ce 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -361,6 +361,98 @@ Examples:
 
         json.dump(label_studio_data, f, indent=4)
 
 
+You can also include pre-annotations as part of your LabelStudio upload. The
+``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be one list
+of annotations for each element in the ``elements`` list. If an element does not have any
+annotations, use an empty list.
+
+The following shows an example of how to upload annotations for the "Text Classification"
+task in LabelStudio:
+
+.. code:: python
+
+    import json
+
+    from unstructured.documents.elements import NarrativeText
+    from unstructured.staging.label_studio import (
+        stage_for_label_studio,
+        LabelStudioAnnotation,
+        LabelStudioResult,
+    )
+
+    elements = [NarrativeText(text="Narrative")]
+    annotations = [[
+        LabelStudioAnnotation(
+            result=[
+                LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ]
+        )
+    ]]
+    label_studio_data = stage_for_label_studio(
+        elements,
+        annotations=annotations,
+        text_field="my_text",
+        id_field="my_id"
+    )
+
+    # The resulting JSON file is ready to be uploaded to LabelStudio
+    # with annotations included
+    with open("label_studio.json", "w") as f:
+        json.dump(label_studio_data, f, indent=4)
+
+
+The following shows an example of how to upload annotations for the "Named Entity Recognition"
+task in LabelStudio:
+
+.. code:: python
+
+    import json
+
+    from unstructured.documents.elements import NarrativeText
+    from unstructured.staging.label_studio import (
+        stage_for_label_studio,
+        LabelStudioAnnotation,
+        LabelStudioResult,
+    )
+
+    elements = [NarrativeText(text="Narrative")]
+    annotations = [[
+        LabelStudioAnnotation(
+            result=[
+                LabelStudioResult(
+                    type="labels",
+                    value={"start": 0, "end": 9, "text": "Narrative", "labels": ["MISC"]},
+                    from_name="label",
+                    to_name="text",
+                )
+            ]
+        )
+    ]]
+    label_studio_data = stage_for_label_studio(
+        elements,
+        annotations=annotations,
+        text_field="my_text",
+        id_field="my_id"
+    )
+
+    # The resulting JSON file is ready to be uploaded to LabelStudio
+    # with annotations included
+    with open("label_studio.json", "w") as f:
+        json.dump(label_studio_data, f, indent=4)
+
+
+See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full list of options
+for labels and annotations.
+
+
 ``stage_for_prodigy``
 --------------------------
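The docs above note that elements without pre-annotations take an empty list. As a minimal
sketch of staging a mix of annotated and unannotated elements, using only names introduced in
this patch (the element texts are illustrative):

.. code:: python

    from unstructured.documents.elements import NarrativeText
    from unstructured.staging.label_studio import (
        stage_for_label_studio,
        LabelStudioAnnotation,
        LabelStudioResult,
    )

    elements = [
        NarrativeText(text="A story with a label"),
        NarrativeText(text="A story without one"),
    ]
    # One inner list per element: the first element carries a pre-annotation,
    # the second is left empty for annotators to label in LabelStudio.
    annotations = [
        [
            LabelStudioAnnotation(
                result=[
                    LabelStudioResult(
                        type="choices",
                        value={"choices": ["Positive"]},
                        from_name="sentiment",
                        to_name="text",
                    )
                ]
            )
        ],
        [],
    ]
    # Each task dict now carries an "annotations" key; it is an empty list
    # for the second element.
    label_studio_data = stage_for_label_studio(elements, annotations=annotations)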
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9663b10e7..067512695 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,7 +22,7 @@ copyright = '2022, Unstructured Technologies'
 author = 'Unstructured Technologies'
 
 # The full version, including alpha/beta/rc tags
-release = '0.0.1'
+release = '0.2.1-dev3'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py
index 644f1b132..1514ca3de 100644
--- a/test_unstructured/staging/test_label_studio.py
+++ b/test_unstructured/staging/test_label_studio.py
@@ -28,3 +28,175 @@ def test_specify_text_name(elements):
 def test_specify_id_name(elements):
     label_studio_data = label_studio.stage_for_label_studio(elements, id_field="random_id")
     assert "random_id" in label_studio_data[0]["data"]
+
+
+def test_created_annotation():
+    annotation = label_studio.LabelStudioAnnotation(
+        result=[
+            label_studio.LabelStudioResult(
+                type="choices",
+                value={"choices": ["Positive"]},
+                from_name="sentiment",
+                to_name="text",
+            )
+        ]
+    )
+
+    assert annotation.to_dict() == {
+        "result": [
+            {
+                "type": "choices",
+                "value": {"choices": ["Positive"]},
+                "from_name": "sentiment",
+                "id": None,
+                "to_name": "text",
+                "hidden": False,
+                "read_only": False,
+            }
+        ],
+        "was_canceled": False,
+    }
+
+
+def test_stage_with_annotation():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ]
+        )
+    ]
+    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    assert label_studio_data == [
+        {
+            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
+            "annotations": [
+                {
+                    "result": [
+                        {
+                            "type": "choices",
+                            "value": {"choices": ["Positive"]},
+                            "from_name": "sentiment",
+                            "id": None,
+                            "to_name": "text",
+                            "hidden": False,
+                            "read_only": False,
+                        }
+                    ],
+                    "was_canceled": False,
+                }
+            ],
+        }
+    ]
+
+
+def test_stage_with_annotation_for_ner():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="labels",
+                    value={"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
+                    from_name="label",
+                    to_name="text",
+                )
+            ]
+        )
+    ]
+    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    assert label_studio_data == [
+        {
+            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
+            "annotations": [
+                {
+                    "result": [
+                        {
+                            "type": "labels",
+                            "value": {"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
+                            "from_name": "label",
+                            "id": None,
+                            "to_name": "text",
+                            "hidden": False,
+                            "read_only": False,
+                        }
+                    ],
+                    "was_canceled": False,
+                }
+            ],
+        }
+    ]
+
+
+def test_stage_with_annotation_raises_with_mismatched_lengths():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ]
+        )
+    ]
+    with pytest.raises(ValueError):
+        label_studio.stage_for_label_studio([element], [annotations, annotations])
+
+
+def test_stage_with_annotation_raises_with_invalid_type():
+    with pytest.raises(ValueError):
+        label_studio.LabelStudioResult(
+            type="bears",
+            value={"bears": ["Positive"]},
+            from_name="sentiment",
+            to_name="text",
+        )
+
+
+def test_stage_with_reviewed_annotation():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ],
+            reviews=[label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)],
+        )
+    ]
+    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    assert label_studio_data == [
+        {
+            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
+            "annotations": [
+                {
+                    "result": [
+                        {
+                            "type": "choices",
+                            "value": {"choices": ["Positive"]},
+                            "from_name": "sentiment",
+                            "to_name": "text",
+                            "id": None,
+                            "hidden": False,
+                            "read_only": False,
+                        }
+                    ],
+                    "reviews": [{"created_by": {"user_id": 1}, "accepted": True, "id": None}],
+                    "was_canceled": False,
+                }
+            ],
+        }
+    ]
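The tests above stage annotations into plain JSON, and the docs note the result is ready to be
uploaded to LabelStudio. A rough sketch of one upload path, LabelStudio's task import API; the
host, project ID, and token below are placeholders, and the exact endpoint should be confirmed
against your LabelStudio version's API documentation:

.. code:: python

    import json

    import requests

    with open("label_studio.json") as f:
        tasks = json.load(f)

    # Placeholder host, project ID, and API token.
    response = requests.post(
        "http://localhost:8080/api/projects/1/import",
        headers={"Authorization": "Token your-api-token"},
        json=tasks,
    )
    response.raise_for_status()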
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 20b14eae9..3cf7ddb67 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.1-dev2"  # pragma: no cover
+__version__ = "0.2.1-dev3"  # pragma: no cover
diff --git a/unstructured/staging/label_studio.py b/unstructured/staging/label_studio.py
index 1e83633ac..8927f4bd9 100644
--- a/unstructured/staging/label_studio.py
+++ b/unstructured/staging/label_studio.py
@@ -1,21 +1,126 @@
-from typing import Dict, List
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
 
 from unstructured.documents.elements import Text
 
 LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]]
 
+# NOTE(robinson) - ref: https://labelstud.io/tags/labels.html
+VALID_LABEL_TYPES = [
+    "labels",
+    "hypertextlabels",
+    "paragraphlabels",
+    "rectangle",
+    "keypoint",
+    "polygon",
+    "brush",
+    "ellipse",
+    "rectanglelabels",
+    "keypointlabels",
+    "polygonlabels",
+    "brushlabels",
+    "ellipselabels",
+    "timeserieslabels",
+    "choices",
+    "number",
+    "taxonomy",
+    "textarea",
+    "rating",
+    "pairwise",
+    "videorectangle",
+]
+
+
+@dataclass
+class LabelStudioResult:
+    """Class for representing a LabelStudio annotation result.
+    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
+
+    type: str  # The type of tag used to annotate the task
+    value: Dict[str, Any]  # The values for the annotation (e.g. the choices or labels)
+    from_name: str  # Name of the control tag used to label the region (e.g. "sentiment")
+    to_name: str  # Name of the object tag that provided the region to be labeled (e.g. "text")
+    id: Optional[str] = None
+    hidden: bool = False
+    read_only: bool = False
+
+    def __post_init__(self):
+        if self.type not in VALID_LABEL_TYPES:
+            raise ValueError(
+                f"{self.type} is not a valid label type. "
+                f"Valid label types are: {VALID_LABEL_TYPES}"
+            )
+
+    def to_dict(self):
+        return self.__dict__
+
+
+@dataclass
+class LabelStudioReview:
+    """Class for representing a LabelStudio review. Reviews are only available in the
+    Enterprise offering.
+    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
+
+    created_by: Dict[str, Union[str, int]]
+    accepted: bool
+    id: Optional[str] = None
+
+    def to_dict(self):
+        return self.__dict__
+
+
+@dataclass
+class LabelStudioAnnotation:
+    """Class for representing LabelStudio annotations.
+    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
+
+    result: List[LabelStudioResult]  # The result of the annotation
+    id: Optional[str] = None
+    lead_time: Optional[float] = None  # Time in seconds to label the task
+    completed_by: Optional[int] = None  # User ID for the user who completed the task
+    reviews: Optional[List[LabelStudioReview]] = None  # An array of the review results
+    was_canceled: bool = False  # Indicates whether or not the annotation was canceled
+
+    def to_dict(self):
+        annotation_dict = deepcopy(self.__dict__)
+        annotation_dict["result"] = [r.to_dict() for r in annotation_dict["result"]]
+        if annotation_dict["reviews"] is not None:
+            annotation_dict["reviews"] = [r.to_dict() for r in annotation_dict["reviews"]]
+
+        # NOTE(robinson) - Removes keys for any fields that defaulted to None
+        _annotation_dict = deepcopy(annotation_dict)
+        for key, value in annotation_dict.items():
+            if value is None:
+                _annotation_dict.pop(key)
+
+        return _annotation_dict
+
 
 def stage_for_label_studio(
-    elements: List[Text], text_field: str = "text", id_field: str = "ref_id"
+    elements: List[Text],
+    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
+    text_field: str = "text",
+    id_field: str = "ref_id",
 ) -> LABEL_STUDIO_TYPE:
     """Converts the document to the format required for upload to LabelStudio.
     ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
+    if annotations is not None and len(elements) != len(annotations):
+        raise ValueError("The length of elements and annotations must match.")
+
     label_studio_data: LABEL_STUDIO_TYPE = list()
-    for element in elements:
+    for i, element in enumerate(elements):
         data: Dict[str, str] = dict()
         data[text_field] = element.text
         if isinstance(element.id, str):
             data[id_field] = element.id
-        label_studio_data.append({"data": data})
+
+        labeling_example: Dict[str, Any] = dict()
+        labeling_example["data"] = data
+        if annotations is not None:
+            labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
+        label_studio_data.append(labeling_example)
+
     return label_studio_data