mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	feat: Optionally include LabelStudio annotations in staging brick (#19)
* added types for label studio annotations * added method to cast as dicts * added length check for annotations * tweaks to get upload to work * added validation for label types * annotations is a list for each example * little bit of refactoring * test for staging with label studio * tests for error conditions and reviewers * added test for NER annotations * updated changelog and bumped version * added docs with annotation examples * fix label studio link * bump version in sphinx docs * fulle -> full (typo fix)
This commit is contained in:
		
							parent
							
								
									29607c32ba
								
							
						
					
					
						commit
						a950559b94
					
				| @ -1,7 +1,8 @@ | |||||||
| ## 0.2.1-dev2 | ## 0.2.1-dev3 | ||||||
| 
 | 
 | ||||||
| * Added staging brick for CSV format for Prodigy | * Added staging brick for CSV format for Prodigy | ||||||
| * Added staging brick for Prodigy | * Added staging brick for Prodigy | ||||||
|  | * Added ability to upload LabelStudio annotations | ||||||
| * Added text_field and id_field to stage_for_label_studio signature | * Added text_field and id_field to stage_for_label_studio signature | ||||||
| 
 | 
 | ||||||
| ## 0.2.0 | ## 0.2.0 | ||||||
|  | |||||||
| @ -361,6 +361,98 @@ Examples: | |||||||
|       json.dump(label_studio_data, f, indent=4) |       json.dump(label_studio_data, f, indent=4) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | You can also include pre-annotations as part of your LabelStudio upload. The | ||||||
|  | ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of | ||||||
|  | annotations for each element in the ``elements`` list. If an element does not have any annotations, | ||||||
|  | use an empty list. | ||||||
|  | The following shows an example of how to upload annotations for the "Text Classification" | ||||||
|  | task in LabelStudio: | ||||||
|  | 
 | ||||||
|  | .. code:: python | ||||||
|  | 
 | ||||||
|  |   import json | ||||||
|  | 
 | ||||||
|  |   from unstructured.documents.elements import NarrativeText | ||||||
|  |   from unstructured.staging.label_studio import ( | ||||||
|  |       stage_for_label_studio, | ||||||
|  |       LabelStudioAnnotation, | ||||||
|  |       LabelStudioResult, | ||||||
|  |   ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |   elements = [NarrativeText(text="Narrative")] | ||||||
|  |   annotations = [[ | ||||||
|  |     LabelStudioAnnotation( | ||||||
|  |         result=[ | ||||||
|  |             LabelStudioResult( | ||||||
|  |                 type="choices", | ||||||
|  |                 value={"choices": ["Positive"]}, | ||||||
|  |                 from_name="sentiment", | ||||||
|  |                 to_name="text", | ||||||
|  |             ) | ||||||
|  |         ] | ||||||
|  |     ) | ||||||
|  |   ]] | ||||||
|  |   label_studio_data = stage_for_label_studio( | ||||||
|  |       elements, | ||||||
|  |       annotations=annotations, | ||||||
|  |       text_field="my_text", | ||||||
|  |       id_field="my_id" | ||||||
|  |   ) | ||||||
|  | 
 | ||||||
|  |   # The resulting JSON file is ready to be uploaded to LabelStudio | ||||||
|  |   # with annotations included | ||||||
|  |   with open("label_studio.json", "w") as f: | ||||||
|  |       json.dump(label_studio_data, f, indent=4) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | The following shows an example of how to upload annotations for the "Named Entity Recognition" | ||||||
|  | task in LabelStudio: | ||||||
|  | 
 | ||||||
|  | .. code:: python | ||||||
|  | 
 | ||||||
|  |   import json | ||||||
|  | 
 | ||||||
|  |   from unstructured.documents.elements import NarrativeText | ||||||
|  |   from unstructured.staging.label_studio import ( | ||||||
|  |       stage_for_label_studio, | ||||||
|  |       LabelStudioAnnotation, | ||||||
|  |       LabelStudioResult, | ||||||
|  |   ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |   elements = [NarrativeText(text="Narrative")] | ||||||
|  |   annotations = [[ | ||||||
|  |     LabelStudioAnnotation( | ||||||
|  |         result=[ | ||||||
|  |             LabelStudioResult( | ||||||
|  |                 type="labels", | ||||||
|  |                 value={"start": 0, "end": 9, "text": "Narrative", "labels": ["MISC"]}, | ||||||
|  |                 from_name="label", | ||||||
|  |                 to_name="text", | ||||||
|  |             ) | ||||||
|  |         ] | ||||||
|  |     ) | ||||||
|  |   ]] | ||||||
|  |   label_studio_data = stage_for_label_studio( | ||||||
|  |       elements, | ||||||
|  |       annotations=annotations, | ||||||
|  |       text_field="my_text", | ||||||
|  |       id_field="my_id" | ||||||
|  |   ) | ||||||
|  | 
 | ||||||
|  |   # The resulting JSON file is ready to be uploaded to LabelStudio | ||||||
|  |   # with annotations included | ||||||
|  |   with open("label_studio.json", "w") as f: | ||||||
|  |       json.dump(label_studio_data, f, indent=4) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full list of options | ||||||
|  | for labels and annotations. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| ``stage_for_prodigy`` | ``stage_for_prodigy`` | ||||||
| -------------------------- | -------------------------- | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -22,7 +22,7 @@ copyright = '2022, Unstructured Technologies' | |||||||
| author = 'Unstructured Technologies' | author = 'Unstructured Technologies' | ||||||
| 
 | 
 | ||||||
| # The full version, including alpha/beta/rc tags | # The full version, including alpha/beta/rc tags | ||||||
| release = '0.0.1' | release = '0.2.1-dev3' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # -- General configuration --------------------------------------------------- | # -- General configuration --------------------------------------------------- | ||||||
|  | |||||||
| @ -28,3 +28,175 @@ def test_specify_text_name(elements): | |||||||
def test_specify_id_name(elements):
    # A custom id_field should be used as the key for the element id.
    staged = label_studio.stage_for_label_studio(elements, id_field="random_id")
    assert "random_id" in staged[0]["data"]
|  | 
 | ||||||
|  | 
 | ||||||
def test_created_annotation():
    """LabelStudioAnnotation.to_dict serializes nested results with defaults filled in."""
    annotation = label_studio.LabelStudioAnnotation(
        result=[
            label_studio.LabelStudioResult(
                type="choices",
                value={"choices": ["Positive"]},
                from_name="sentiment",
                to_name="text",
            )
        ]
    )

    # BUG FIX: the comparison was a bare expression whose result was discarded,
    # so the test could never fail. It must be asserted.
    assert annotation.to_dict() == {
        "result": [
            {
                "type": "choices",
                "value": {"choices": ["Positive"]},
                "from_name": "sentiment",
                "id": None,
                "to_name": "text",
                "hidden": False,
                "read_only": False,
            }
        ],
        "was_canceled": False,
    }
|  | 
 | ||||||
|  | 
 | ||||||
def test_stage_with_annotation():
    """Annotations passed to stage_for_label_studio appear serialized on the example."""
    element = NarrativeText(text="A big brown bear")
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ]
        )
    ]
    staged = label_studio.stage_for_label_studio([element], [annotations])

    expected_result = {
        "type": "choices",
        "value": {"choices": ["Positive"]},
        "from_name": "sentiment",
        "id": None,
        "to_name": "text",
        "hidden": False,
        "read_only": False,
    }
    expected_data = {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"}
    assert staged == [
        {
            "data": expected_data,
            "annotations": [{"result": [expected_result], "was_canceled": False}],
        }
    ]
|  | 
 | ||||||
|  | 
 | ||||||
def test_stage_with_annotation_for_ner():
    """NER-style ("labels") annotations round-trip through the staging brick."""
    element = NarrativeText(text="A big brown bear")
    ner_value = {"start": 12, "end": 16, "text": "bear", "labels": ["PER"]}
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="labels",
                    value=ner_value,
                    from_name="label",
                    to_name="text",
                )
            ]
        )
    ]
    staged = label_studio.stage_for_label_studio([element], [annotations])

    expected_result = {
        "type": "labels",
        "value": ner_value,
        "from_name": "label",
        "id": None,
        "to_name": "text",
        "hidden": False,
        "read_only": False,
    }
    expected_data = {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"}
    assert staged == [
        {
            "data": expected_data,
            "annotations": [{"result": [expected_result], "was_canceled": False}],
        }
    ]
|  | 
 | ||||||
|  | 
 | ||||||
def test_stage_with_annotation_raises_with_mismatched_lengths():
    """One element with two annotation lists must raise a ValueError."""
    element = NarrativeText(text="A big brown bear")
    annotation = label_studio.LabelStudioAnnotation(
        result=[
            label_studio.LabelStudioResult(
                type="choices",
                value={"choices": ["Positive"]},
                from_name="sentiment",
                to_name="text",
            )
        ]
    )
    with pytest.raises(ValueError):
        label_studio.stage_for_label_studio([element], [[annotation], [annotation]])
|  | 
 | ||||||
|  | 
 | ||||||
def test_stage_with_annotation_raises_with_invalid_type():
    """Constructing a result with a label type outside VALID_LABEL_TYPES fails."""
    bad_kwargs = {
        "type": "bears",
        "value": {"bears": ["Positive"]},
        "from_name": "sentiment",
        "to_name": "text",
    }
    with pytest.raises(ValueError):
        label_studio.LabelStudioResult(**bad_kwargs)
|  | 
 | ||||||
|  | 
 | ||||||
def test_stage_with_reviewed_annotation():
    """Reviews attached to an annotation are serialized alongside the results."""
    element = NarrativeText(text="A big brown bear")
    review = label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)
    annotations = [
        label_studio.LabelStudioAnnotation(
            result=[
                label_studio.LabelStudioResult(
                    type="choices",
                    value={"choices": ["Positive"]},
                    from_name="sentiment",
                    to_name="text",
                )
            ],
            reviews=[review],
        )
    ]
    staged = label_studio.stage_for_label_studio([element], [annotations])

    expected_result = {
        "type": "choices",
        "value": {"choices": ["Positive"]},
        "from_name": "sentiment",
        "to_name": "text",
        "id": None,
        "hidden": False,
        "read_only": False,
    }
    expected_annotation = {
        "result": [expected_result],
        "reviews": [{"created_by": {"user_id": 1}, "accepted": True, "id": None}],
        "was_canceled": False,
    }
    expected_data = {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"}
    assert staged == [{"data": expected_data, "annotations": [expected_annotation]}]
|  | |||||||
| @ -1 +1 @@ | |||||||
| __version__ = "0.2.1-dev2"  # pragma: no cover | __version__ = "0.2.1-dev3"  # pragma: no cover | ||||||
|  | |||||||
| @ -1,21 +1,126 @@ | |||||||
| from typing import Dict, List | from copy import deepcopy | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from typing import Any, Dict, List, Optional, Union | ||||||
| 
 | 
 | ||||||
| from unstructured.documents.elements import Text | from unstructured.documents.elements import Text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]] | LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]] | ||||||
| 
 | 
 | ||||||
# NOTE(robinson) - ref: https://labelstud.io/tags/labels.html
# Order matters: this list is interpolated into the ValueError message below.
VALID_LABEL_TYPES = [
    "labels",
    "hypertextlabels",
    "paragraphlabels",
    "rectangle",
    "keypoint",
    "polygon",
    "brush",
    "ellipse",
    "rectanglelabels",
    "keypointlabels",
    "polygonlabels",
    "brushlabels",
    "ellipselabels",
    "timeserieslabels",
    "choices",
    "number",
    "taxonomy",
    "textarea",
    "rating",
    "pairwise",
    "videorectangle",
]


@dataclass
class LabelStudioResult:
    """A single LabelStudio annotation result.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    type: str  # The type of tag used to annotate the task (must be in VALID_LABEL_TYPES)
    value: Dict[str, Any]  # The annotated values, keyed per the tag type
    from_name: str  # Name of the source object tag (i.e. "sentiment" for the sentiment template)
    to_name: str  # Name of the destination control tag
    id: Optional[str] = None
    hidden: bool = False
    read_only: bool = False

    def __post_init__(self):
        # Reject label types LabelStudio does not recognize up front.
        if self.type in VALID_LABEL_TYPES:
            return
        raise ValueError(
            f"{self.type} is not a valid label type. "
            f"Valid label types are: {VALID_LABEL_TYPES}"
        )

    def to_dict(self):
        """Serialize the result as a plain dictionary."""
        return vars(self)

|  | 
 | ||||||
@dataclass
class LabelStudioReview:
    """A LabelStudio review of an annotation. Reviews are available only in the
    Enterprise offering.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    created_by: Dict[str, Union[str, int]]  # Reviewer identity, e.g. {"user_id": 1}
    accepted: bool  # Whether the reviewer accepted the annotation
    id: Optional[str] = None

    def to_dict(self):
        """Serialize the review as a plain dictionary."""
        return vars(self)
|  | 
 | ||||||
|  | 
 | ||||||
@dataclass
class LabelStudioAnnotation:
    """A LabelStudio annotation for a labeling task.
    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""

    result: List[LabelStudioResult]  # The result of the annotation
    id: Optional[str] = None
    lead_time: Optional[float] = None  # Time in seconds to label the task
    completed_by: Optional[int] = None  # User ID for the user who completed the task
    reviews: Optional[List[LabelStudioReview]] = None  # An array of the review results
    was_canceled: bool = False  # Indicates whether or not the annotation was canceled

    def to_dict(self):
        """Serialize the annotation (including nested results and reviews) as a
        plain dictionary, omitting any fields that are still None."""
        serialized = deepcopy(self.__dict__)
        serialized["result"] = [res.to_dict() for res in serialized["result"]]
        if serialized.get("reviews") is not None:
            serialized["reviews"] = [rev.to_dict() for rev in serialized["reviews"]]

        # NOTE(robinson) - Removes keys for any fields that defaulted to None
        return {key: value for key, value in serialized.items() if value is not None}

|  | 
 | ||||||
| 
 | 
 | ||||||
def stage_for_label_studio(
    elements: List[Text],
    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
    text_field: str = "text",
    id_field: str = "ref_id",
) -> LABEL_STUDIO_TYPE:
    """Converts the document to the format required for upload to LabelStudio.
    ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
    # Each element needs its own (possibly empty) list of annotations.
    if annotations is not None and len(elements) != len(annotations):
        raise ValueError("The length of elements and annotations must match.")

    label_studio_data: LABEL_STUDIO_TYPE = []
    for idx, element in enumerate(elements):
        data: Dict[str, str] = {text_field: element.text}
        if isinstance(element.id, str):
            data[id_field] = element.id

        labeling_example: Dict[str, Any] = {"data": data}
        if annotations is not None:
            labeling_example["annotations"] = [
                annotation.to_dict() for annotation in annotations[idx]
            ]
        label_studio_data.append(labeling_example)

    return label_studio_data
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Matt Robinson
						Matt Robinson