	feat: Optionally include LabelStudio annotations in staging brick (#19)
* added types for label studio annotations
* added method to cast as dicts
* added length check for annotations
* tweaks to get upload to work
* added validation for label types
* annotations is a list for each example
* little bit of refactoring
* test for staging with label studio
* tests for error conditions and reviewers
* added test for NER annotations
* updated changelog and bumped version
* added docs with annotation examples
* fix label studio link
* bump version in sphinx docs
* fulle -> full (typo fix)
This commit is contained in:

parent 29607c32ba
commit a950559b94
@@ -1,7 +1,8 @@
-## 0.2.1-dev2
+## 0.2.1-dev3
 
-* Added staging brick for CSV format for Prodigy
+* Added staging brick for Prodigy
+* Added ability to upload LabelStudio annotations
+* Added text_field and id_field to stage_for_label_studio signature
 
 ## 0.2.0
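For quick orientation before the diff details below: a minimal sketch of the new call shape and its length validation. This is illustrative only, assuming the ``stage_for_label_studio`` signature introduced in this commit:

.. code:: python

  from unstructured.documents.elements import NarrativeText
  from unstructured.staging.label_studio import stage_for_label_studio

  elements = [NarrativeText(text="Narrative")]

  # Without annotations, behavior is unchanged from earlier releases
  label_studio_data = stage_for_label_studio(elements)

  # With annotations, the outer list needs one entry per element;
  # a mismatch raises ValueError (see the length check in the diff below)
  try:
      stage_for_label_studio(elements, annotations=[[], []])
  except ValueError:
      pass  # len(annotations) != len(elements)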
@@ -361,6 +361,98 @@ Examples:
       json.dump(label_studio_data, f, indent=4)
 
 
+You can also include pre-annotations as part of your LabelStudio upload. The
+``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
+annotations for each element in the ``elements`` list. If an element does not have any annotations,
+use an empty list.
+The following example shows how to upload annotations for the "Text Classification"
+task in LabelStudio:
+
+.. code:: python
+
+  import json
+
+  from unstructured.documents.elements import NarrativeText
+  from unstructured.staging.label_studio import (
+      stage_for_label_studio,
+      LabelStudioAnnotation,
+      LabelStudioResult,
+  )
+
+
+  elements = [NarrativeText(text="Narrative")]
+  annotations = [[
+    LabelStudioAnnotation(
+        result=[
+            LabelStudioResult(
+                type="choices",
+                value={"choices": ["Positive"]},
+                from_name="sentiment",
+                to_name="text",
+            )
+        ]
+    )
+  ]]
+  label_studio_data = stage_for_label_studio(
+      elements,
+      annotations=annotations,
+      text_field="my_text",
+      id_field="my_id"
+  )
+
+  # The resulting JSON file is ready to be uploaded to LabelStudio
+  # with annotations included
+  with open("label_studio.json", "w") as f:
+      json.dump(label_studio_data, f, indent=4)
+
+
+The following example shows how to upload annotations for the "Named Entity Recognition"
+task in LabelStudio:
+
+.. code:: python
+
+  import json
+
+  from unstructured.documents.elements import NarrativeText
+  from unstructured.staging.label_studio import (
+      stage_for_label_studio,
+      LabelStudioAnnotation,
+      LabelStudioResult,
+  )
+
+
+  elements = [NarrativeText(text="Narrative")]
+  annotations = [[
+    LabelStudioAnnotation(
+        result=[
+            LabelStudioResult(
+                type="labels",
+                value={"start": 0, "end": 9, "text": "Narrative", "labels": ["MISC"]},
+                from_name="label",
+                to_name="text",
+            )
+        ]
+    )
+  ]]
+  label_studio_data = stage_for_label_studio(
+      elements,
+      annotations=annotations,
+      text_field="my_text",
+      id_field="my_id"
+  )
+
+  # The resulting JSON file is ready to be uploaded to LabelStudio
+  # with annotations included
+  with open("label_studio.json", "w") as f:
+      json.dump(label_studio_data, f, indent=4)
+
+
+See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full list of options
+for labels and annotations.
+
+
 ``stage_for_prodigy``
 --------------------------
 
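The docs above say to use an empty list for elements that carry no annotations. A minimal sketch of that mixed case (illustrative, not part of the committed docs; ``Title`` is just another element type from ``unstructured.documents.elements``):

.. code:: python

  from unstructured.documents.elements import NarrativeText, Title
  from unstructured.staging.label_studio import (
      stage_for_label_studio,
      LabelStudioAnnotation,
      LabelStudioResult,
  )

  elements = [Title(text="A Big Brown Bear"), NarrativeText(text="Narrative")]
  annotations = [
      [],  # the Title has no pre-annotations
      [
          LabelStudioAnnotation(
              result=[
                  LabelStudioResult(
                      type="choices",
                      value={"choices": ["Positive"]},
                      from_name="sentiment",
                      to_name="text",
                  )
              ]
          )
      ],
  ]
  # One annotation list per element, so the length check passes
  label_studio_data = stage_for_label_studio(elements, annotations=annotations)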
@@ -22,7 +22,7 @@ copyright = '2022, Unstructured Technologies'
 author = 'Unstructured Technologies'
 
 # The full version, including alpha/beta/rc tags
-release = '0.0.1'
+release = '0.2.1-dev3'
 
 
 # -- General configuration ---------------------------------------------------
@@ -28,3 +28,175 @@ def test_specify_text_name(elements):
 def test_specify_id_name(elements):
     label_studio_data = label_studio.stage_for_label_studio(elements, id_field="random_id")
     assert "random_id" in label_studio_data[0]["data"]
+
+
+def test_created_annotation():
+    annotation = label_studio.LabelStudioAnnotation(
+        result=[
+            label_studio.LabelStudioResult(
+                type="choices",
+                value={"choices": ["Positive"]},
+                from_name="sentiment",
+                to_name="text",
+            )
+        ]
+    )
+
+    assert annotation.to_dict() == {
+        "result": [
+            {
+                "type": "choices",
+                "value": {"choices": ["Positive"]},
+                "from_name": "sentiment",
+                "id": None,
+                "to_name": "text",
+                "hidden": False,
+                "read_only": False,
+            }
+        ],
+        "was_canceled": False,
+    }
+
+
+def test_stage_with_annotation():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ]
+        )
+    ]
+    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    assert label_studio_data == [
+        {
+            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
+            "annotations": [
+                {
+                    "result": [
+                        {
+                            "type": "choices",
+                            "value": {"choices": ["Positive"]},
+                            "from_name": "sentiment",
+                            "id": None,
+                            "to_name": "text",
+                            "hidden": False,
+                            "read_only": False,
+                        }
+                    ],
+                    "was_canceled": False,
+                }
+            ],
+        }
+    ]
+
+
+def test_stage_with_annotation_for_ner():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="labels",
+                    value={"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
+                    from_name="label",
+                    to_name="text",
+                )
+            ]
+        )
+    ]
+    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    assert label_studio_data == [
+        {
+            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
+            "annotations": [
+                {
+                    "result": [
+                        {
+                            "type": "labels",
+                            "value": {"start": 12, "end": 16, "text": "bear", "labels": ["PER"]},
+                            "from_name": "label",
+                            "id": None,
+                            "to_name": "text",
+                            "hidden": False,
+                            "read_only": False,
+                        }
+                    ],
+                    "was_canceled": False,
+                }
+            ],
+        }
+    ]
+
+
+def test_stage_with_annotation_raises_with_mismatched_lengths():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ]
+        )
+    ]
+    with pytest.raises(ValueError):
+        label_studio.stage_for_label_studio([element], [annotations, annotations])
+
+
+def test_stage_with_annotation_raises_with_invalid_type():
+    with pytest.raises(ValueError):
+        label_studio.LabelStudioResult(
+            type="bears",
+            value={"bears": ["Positive"]},
+            from_name="sentiment",
+            to_name="text",
+        )
+
+
+def test_stage_with_reviewed_annotation():
+    element = NarrativeText(text="A big brown bear")
+    annotations = [
+        label_studio.LabelStudioAnnotation(
+            result=[
+                label_studio.LabelStudioResult(
+                    type="choices",
+                    value={"choices": ["Positive"]},
+                    from_name="sentiment",
+                    to_name="text",
+                )
+            ],
+            reviews=[label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)],
+        )
+    ]
+    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    assert label_studio_data == [
+        {
+            "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
+            "annotations": [
+                {
+                    "result": [
+                        {
+                            "type": "choices",
+                            "value": {"choices": ["Positive"]},
+                            "from_name": "sentiment",
+                            "to_name": "text",
+                            "id": None,
+                            "hidden": False,
+                            "read_only": False,
+                        }
+                    ],
+                    "reviews": [{"created_by": {"user_id": 1}, "accepted": True, "id": None}],
+                    "was_canceled": False,
+                }
+            ],
+        }
+    ]
@@ -1 +1 @@
-__version__ = "0.2.1-dev2"  # pragma: no cover
+__version__ = "0.2.1-dev3"  # pragma: no cover
@@ -1,21 +1,126 @@
-from typing import Dict, List
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
 
 from unstructured.documents.elements import Text
 
 
 LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]]
 
+# NOTE(robinson) - ref: https://labelstud.io/tags/labels.html
+VALID_LABEL_TYPES = [
+    "labels",
+    "hypertextlabels",
+    "paragraphlabels",
+    "rectangle",
+    "keypoint",
+    "polygon",
+    "brush",
+    "ellipse",
+    "rectanglelabels",
+    "keypointlabels",
+    "polygonlabels",
+    "brushlabels",
+    "ellipselabels",
+    "timeserieslabels",
+    "choices",
+    "number",
+    "taxonomy",
+    "textarea",
+    "rating",
+    "pairwise",
+    "videorectangle",
+]
+
+
+@dataclass
+class LabelStudioResult:
+    """Class for representing a LabelStudio annotation result.
+    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
+
+    type: str  # The type of tag used to annotate the task
+    value: Dict[str, Any]  # The values for the annotation (e.g. choices or labeled spans)
+    from_name: str  # Name of the control tag (e.g. "sentiment" for the sentiment template)
+    to_name: str  # Name of the object tag being annotated (e.g. "text")
+    id: Optional[str] = None
+    hidden: bool = False
+    read_only: bool = False
+
+    def __post_init__(self):
+        if self.type not in VALID_LABEL_TYPES:
+            raise ValueError(
+                f"{self.type} is not a valid label type. "
+                f"Valid label types are: {VALID_LABEL_TYPES}"
+            )
+
+    def to_dict(self):
+        return self.__dict__
+
+
+@dataclass
+class LabelStudioReview:
+    """Class for representing a LabelStudio review. Reviews are only available in the
+    Enterprise offering.
+    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
+
+    created_by: Dict[str, Union[str, int]]
+    accepted: bool
+    id: Optional[str] = None
+
+    def to_dict(self):
+        return self.__dict__
+
+
+@dataclass
+class LabelStudioAnnotation:
+    """Class for representing LabelStudio annotations.
+    ref: https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks"""
+
+    result: List[LabelStudioResult]  # The result of the annotation
+    id: Optional[str] = None
+    lead_time: Optional[float] = None  # Time in seconds to label the task
+    completed_by: Optional[int] = None  # User ID for the user who completed the task
+    reviews: Optional[List[LabelStudioReview]] = None  # An array of the review results
+    was_canceled: bool = False  # Indicates whether or not the annotation was canceled
+
+    def to_dict(self):
+        annotation_dict = deepcopy(self.__dict__)
+        annotation_dict["result"] = [r.to_dict() for r in annotation_dict["result"]]
+        if "reviews" in annotation_dict and annotation_dict["reviews"] is not None:
+            annotation_dict["reviews"] = [r.to_dict() for r in annotation_dict["reviews"]]
+
+        # NOTE(robinson) - Removes keys for any fields that defaulted to None
+        _annotation_dict = deepcopy(annotation_dict)
+        for key, value in annotation_dict.items():
+            if value is None:
+                _annotation_dict.pop(key)
+
+        return _annotation_dict
+
 
 def stage_for_label_studio(
-    elements: List[Text], text_field: str = "text", id_field: str = "ref_id"
+    elements: List[Text],
+    annotations: Optional[List[List[LabelStudioAnnotation]]] = None,
+    text_field: str = "text",
+    id_field: str = "ref_id",
 ) -> LABEL_STUDIO_TYPE:
     """Converts the document to the format required for upload to LabelStudio.
     ref: https://labelstud.io/guide/tasks.html#Example-JSON-format"""
+    if annotations is not None:
+        if len(elements) != len(annotations):
+            raise ValueError("The length of elements and annotations must match.")
+
     label_studio_data: LABEL_STUDIO_TYPE = list()
-    for element in elements:
+    for i, element in enumerate(elements):
         data: Dict[str, str] = dict()
         data[text_field] = element.text
         if isinstance(element.id, str):
             data[id_field] = element.id
-        label_studio_data.append({"data": data})
+
+        labeling_example: Dict[str, Any] = dict()
+        labeling_example["data"] = data
+        if annotations is not None:
+            labeling_example["annotations"] = [a.to_dict() for a in annotations[i]]
+        label_studio_data.append(labeling_example)
+
     return label_studio_data
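To make the ``to_dict`` behavior above concrete: fields left at their ``None`` defaults are stripped from the annotation dictionary, while ``None`` values inside each result are kept. A minimal sketch, mirroring the expected dictionaries in the tests above:

.. code:: python

  from unstructured.staging.label_studio import (
      LabelStudioAnnotation,
      LabelStudioResult,
  )

  annotation = LabelStudioAnnotation(
      result=[
          LabelStudioResult(
              type="choices",
              value={"choices": ["Positive"]},
              from_name="sentiment",
              to_name="text",
          )
      ]
  )
  # id, lead_time, completed_by, and reviews defaulted to None, so to_dict()
  # drops them; was_canceled keeps its False default, and the result's
  # id=None survives because result dicts are not None-stripped.
  assert annotation.to_dict() == {
      "result": [
          {
              "type": "choices",
              "value": {"choices": ["Positive"]},
              "from_name": "sentiment",
              "to_name": "text",
              "id": None,
              "hidden": False,
              "read_only": False,
          }
      ],
      "was_canceled": False,
  }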