diff --git a/docs/requirements.txt b/docs/requirements.txt
index 564c26d73..bb6c562e4 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -20,7 +20,7 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
-importlib-metadata==4.12.0
+importlib-metadata==5.0.0
# via sphinx
jinja2==3.1.2
# via sphinx
@@ -38,7 +38,7 @@ requests==2.28.1
# via sphinx
snowballstemmer==2.2.0
# via sphinx
-sphinx==5.1.1
+sphinx==5.2.3
# via
# -r requirements/build.in
# sphinx-rtd-theme
diff --git a/requirements/build.txt b/requirements/build.txt
index 3064ff5ec..bb6c562e4 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -20,6 +20,8 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
+importlib-metadata==5.0.0
+ # via sphinx
jinja2==3.1.2
# via sphinx
markupsafe==2.1.1
@@ -38,10 +40,10 @@ snowballstemmer==2.2.0
# via sphinx
sphinx==5.2.3
# via
- # -r build.in
+ # -r requirements/build.in
# sphinx-rtd-theme
sphinx-rtd-theme==1.0.0
- # via -r build.in
+ # via -r requirements/build.in
sphinxcontrib-applehelp==1.0.2
# via sphinx
sphinxcontrib-devhelp==1.0.2
@@ -56,3 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==1.26.12
# via requests
+zipp==3.8.1
+ # via importlib-metadata
diff --git a/requirements/test.in b/requirements/test.in
index 3b50deae3..4b453feeb 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -7,3 +7,5 @@ click>=8.1
flake8
mypy
pytest-cov
+label_studio_sdk
+vcrpy
diff --git a/requirements/test.txt b/requirements/test.txt
index 68bc54cb5..8c7b853f1 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -7,23 +7,37 @@
attrs==22.1.0
# via pytest
black==22.8.0
- # via -r test.in
+ # via -r requirements/test.in
+certifi==2022.9.24
+ # via requests
+charset-normalizer==2.1.1
+ # via requests
click==8.1.3
# via
- # -r test.in
+ # -r requirements/test.in
# black
coverage[toml]==6.4.4
# via
- # -r test.in
+ # -r requirements/test.in
# pytest-cov
flake8==5.0.4
- # via -r test.in
+ # via -r requirements/test.in
+idna==3.4
+ # via
+ # requests
+ # yarl
iniconfig==1.1.1
# via pytest
+label-studio-sdk==0.0.15
+ # via -r requirements/test.in
+lxml==4.9.1
+ # via label-studio-sdk
mccabe==0.7.0
# via flake8
+multidict==6.0.2
+ # via yarl
mypy==0.982
- # via -r test.in
+ # via -r requirements/test.in
mypy-extensions==0.4.3
# via
# black
@@ -40,6 +54,8 @@ py==1.11.0
# via pytest
pycodestyle==2.9.1
# via flake8
+pydantic==1.8.2
+ # via label-studio-sdk
pyflakes==2.5.0
# via flake8
pyparsing==3.0.9
@@ -47,7 +63,13 @@ pyparsing==3.0.9
pytest==7.1.3
# via pytest-cov
pytest-cov==4.0.0
- # via -r test.in
+ # via -r requirements/test.in
+pyyaml==6.0
+ # via vcrpy
+requests==2.28.1
+ # via label-studio-sdk
+six==1.16.0
+ # via vcrpy
tomli==2.0.1
# via
# black
@@ -55,4 +77,15 @@ tomli==2.0.1
# mypy
# pytest
typing-extensions==4.3.0
- # via mypy
+ # via
+ # black
+ # mypy
+ # pydantic
+urllib3==1.26.12
+ # via requests
+vcrpy==4.2.1
+ # via -r requirements/test.in
+wrapt==1.14.1
+ # via vcrpy
+yarl==1.8.1
+ # via vcrpy
diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py
index 1514ca3de..3bc841181 100644
--- a/test_unstructured/staging/test_label_studio.py
+++ b/test_unstructured/staging/test_label_studio.py
@@ -3,12 +3,69 @@ import unstructured.staging.label_studio as label_studio
from unstructured.documents.elements import Title, NarrativeText
+from label_studio_sdk.client import Client
+
+import logging
+import re
+import vcr
+
@pytest.fixture
def elements():
return [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
+@vcr.use_cassette("test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml")
+def test_upload_label_studio_data_with_sdk(caplog, elements):
+ """Upload staged elements to Label Studio via the SDK, replayed from a VCR cassette.
+
+ Testing Instructions
+ ====================
+ 1. Remove file `test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml`;
+ it will be recreated in step 5.
+ 2. Install the label-studio package by running `pip install -U label-studio`.
+ 3. Run `label-studio`, then log in or create an account on the website that opens.
+ 4. Set `LABEL_STUDIO_URL` and `API_KEY` below (the API key is in your account profile).
+ 5. Run this test once; VCR will record the HTTP traffic to the yaml file.
+ 6. Stop the Label Studio instance and run the test again; VCR will replay the responses.
+ """
+ log = logging.getLogger("urllib3")
+ log.setLevel(logging.DEBUG)  # DEBUG so the request/status lines asserted on below reach caplog
+ # URL of the local Label Studio instance; it matches the URIs recorded in the cassette
+ LABEL_STUDIO_URL = "http://localhost:8080"
+ # API_KEY is a temporary key from a local install; it is not valid anywhere else.
+ # Replace it with your own account's API key whenever the VCR cassette is re-recorded.
+ API_KEY = "d44b92c31f592583bffb7e0d817a60c16a937bca"
+ # Connect to the Label Studio API and verify that the server responds
+ ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY)
+ ls.check_connection()
+ ls.delete_all_projects()  # remove existing projects before creating the sample one
+ # Create a sample project for classifying text types
+ project = ls.start_project(
+ title="Text Type Classifications",
+ label_config="""
+
+
+
+
+
+
+
+
+
+
+ """,
+ )
+ label_studio_data = label_studio.stage_for_label_studio(elements)
+ project.import_tasks(label_studio_data)
+ # The captured log must contain the task-import POST together with its 201 (Created) status
+ success_posting_tasks_status = re.compile(r"POST /api/projects/.*/import.*201")
+ assert bool(success_posting_tasks_status.search(caplog.text))
+
+
def test_convert_to_label_studio_data(elements):
label_studio_data = label_studio.stage_for_label_studio(elements)
diff --git a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml b/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml
new file mode 100644
index 000000000..fcfb2b726
--- /dev/null
+++ b/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml
@@ -0,0 +1,319 @@
+interactions:
+- request:
+ body: null
+ headers:
+ Accept:
+ - '*/*'
+ Accept-Encoding:
+ - gzip, deflate, br
+ Authorization:
+ - Token d44b92c31f592583bffb7e0d817a60c16a937bca
+ Connection:
+ - keep-alive
+ User-Agent:
+ - python-requests/2.27.1
+ method: GET
+ uri: http://localhost:8080/health
+ response:
+ body:
+ string: '{"status": "UP"}'
+ headers:
+ Content-Language:
+ - en-us
+ Content-Length:
+ - '16'
+ Content-Type:
+ - text/html; charset=utf-8
+ Date:
+ - Wed, 05 Oct 2022 16:26:14 GMT
+ Referrer-Policy:
+ - same-origin
+ Server:
+ - WSGIServer/0.2 CPython/3.8.13
+ Set-Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w;
+ expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/;
+ SameSite=Lax
+ Vary:
+ - Accept-Language, Cookie, Origin
+ X-Content-Type-Options:
+ - nosniff
+ X-Frame-Options:
+ - DENY
+ status:
+ code: 200
+ message: OK
+- request:
+ body: null
+ headers:
+ Accept:
+ - '*/*'
+ Accept-Encoding:
+ - gzip, deflate, br
+ Authorization:
+ - Token d44b92c31f592583bffb7e0d817a60c16a937bca
+ Connection:
+ - keep-alive
+ Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w
+ User-Agent:
+ - python-requests/2.27.1
+ method: GET
+ uri: http://localhost:8080/api/projects?page_size=10000000
+ response:
+ body:
+ string: '{"count":1,"next":null,"previous":null,"results":[{"id":95,"title":"Text
+ Type Classifications","description":"","label_config":"\n \n \n \n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":1,"first_name":"","last_name":"","email":"yuming@unstructured.io","avatar":null},"created_at":"2022-10-05T16:15:26.800180Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential
+ sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null}]}'
+ headers:
+ Allow:
+ - GET, POST, HEAD, OPTIONS
+ Content-Language:
+ - en-us
+ Content-Length:
+ - '2002'
+ Content-Type:
+ - application/json
+ Date:
+ - Wed, 05 Oct 2022 16:26:14 GMT
+ Referrer-Policy:
+ - same-origin
+ Server:
+ - WSGIServer/0.2 CPython/3.8.13
+ Set-Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w;
+ expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/;
+ SameSite=Lax
+ Vary:
+ - Accept-Language, Cookie, Origin
+ X-Content-Type-Options:
+ - nosniff
+ X-Frame-Options:
+ - DENY
+ status:
+ code: 200
+ message: OK
+- request:
+ body: null
+ headers:
+ Accept:
+ - '*/*'
+ Accept-Encoding:
+ - gzip, deflate, br
+ Authorization:
+ - Token d44b92c31f592583bffb7e0d817a60c16a937bca
+ Connection:
+ - keep-alive
+ Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w
+ User-Agent:
+ - python-requests/2.27.1
+ method: GET
+ uri: http://localhost:8080/api/projects/95
+ response:
+ body:
+ string: '{"id":95,"title":"Text Type Classifications","description":"","label_config":"\n \n \n \n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":1,"first_name":"","last_name":"","email":"yuming@unstructured.io","avatar":null},"created_at":"2022-10-05T16:15:26.800180Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential
+ sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null}'
+ headers:
+ Allow:
+ - GET, PUT, PATCH, DELETE, HEAD, OPTIONS
+ Content-Language:
+ - en-us
+ Content-Length:
+ - '1950'
+ Content-Type:
+ - application/json
+ Date:
+ - Wed, 05 Oct 2022 16:26:14 GMT
+ Referrer-Policy:
+ - same-origin
+ Server:
+ - WSGIServer/0.2 CPython/3.8.13
+ Set-Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w;
+ expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/;
+ SameSite=Lax
+ Vary:
+ - Accept-Language, Cookie, Origin
+ X-Content-Type-Options:
+ - nosniff
+ X-Frame-Options:
+ - DENY
+ status:
+ code: 200
+ message: OK
+- request:
+ body: null
+ headers:
+ Accept:
+ - '*/*'
+ Accept-Encoding:
+ - gzip, deflate, br
+ Authorization:
+ - Token d44b92c31f592583bffb7e0d817a60c16a937bca
+ Connection:
+ - keep-alive
+ Content-Length:
+ - '0'
+ Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w
+ User-Agent:
+ - python-requests/2.27.1
+ method: DELETE
+ uri: http://localhost:8080/api/projects/95/
+ response:
+ body:
+ string: ''
+ headers:
+ Allow:
+ - GET, PUT, PATCH, DELETE, HEAD, OPTIONS
+ Content-Language:
+ - en-us
+ Content-Length:
+ - '0'
+ Date:
+ - Wed, 05 Oct 2022 16:26:14 GMT
+ Referrer-Policy:
+ - same-origin
+ Server:
+ - WSGIServer/0.2 CPython/3.8.13
+ Set-Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w;
+ expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/;
+ SameSite=Lax
+ Vary:
+ - Accept-Language, Cookie, Origin
+ X-Content-Type-Options:
+ - nosniff
+ X-Frame-Options:
+ - DENY
+ status:
+ code: 204
+ message: No Content
+- request:
+ body: '{"title": "Text Type Classifications", "label_config": "\n \n \n \n \n \n \n \n \n \n \n "}'
+ headers:
+ Accept:
+ - '*/*'
+ Accept-Encoding:
+ - gzip, deflate, br
+ Authorization:
+ - Token d44b92c31f592583bffb7e0d817a60c16a937bca
+ Connection:
+ - keep-alive
+ Content-Length:
+ - '591'
+ Content-Type:
+ - application/json
+ Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w
+ User-Agent:
+ - python-requests/2.27.1
+ method: POST
+ uri: http://localhost:8080/api/projects
+ response:
+ body:
+ string: '{"id":96,"title":"Text Type Classifications","description":"","label_config":"\n \n \n \n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":1,"first_name":"","last_name":"","email":"yuming@unstructured.io","avatar":null},"created_at":"2022-10-05T16:26:14.756037Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":null,"task_number":null,"useful_annotation_number":null,"ground_truth_number":null,"skipped_annotations_number":null,"total_annotations_number":null,"total_predictions_number":null,"sampling":"Sequential
+ sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null}'
+ headers:
+ Allow:
+ - GET, POST, HEAD, OPTIONS
+ Content-Language:
+ - en-us
+ Content-Length:
+ - '1971'
+ Content-Type:
+ - application/json
+ Date:
+ - Wed, 05 Oct 2022 16:26:14 GMT
+ Referrer-Policy:
+ - same-origin
+ Server:
+ - WSGIServer/0.2 CPython/3.8.13
+ Set-Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w;
+ expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/;
+ SameSite=Lax
+ Vary:
+ - Accept-Language, Cookie, Origin
+ X-Content-Type-Options:
+ - nosniff
+ X-Frame-Options:
+ - DENY
+ status:
+ code: 201
+ message: Created
+- request:
+ body: '[{"data": {"text": "Title 1", "ref_id": "ab03af41c2940e7584b62df48a964db3"}},
+ {"data": {"text": "Narrative 1", "ref_id": "ff9eb806beb1f483322f6fbda680b08b"}}]'
+ headers:
+ Accept:
+ - '*/*'
+ Accept-Encoding:
+ - gzip, deflate, br
+ Authorization:
+ - Token d44b92c31f592583bffb7e0d817a60c16a937bca
+ Connection:
+ - keep-alive
+ Content-Length:
+ - '158'
+ Content-Type:
+ - application/json
+ Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w
+ User-Agent:
+ - python-requests/2.27.1
+ method: POST
+ uri: http://localhost:8080/api/projects/96/import?return_task_ids=1
+ response:
+ body:
+ string: '{"task_count":2,"annotation_count":0,"prediction_count":0,"duration":0.012760162353515625,"file_upload_ids":[],"could_be_tasks_list":false,"found_formats":[],"data_columns":[],"task_ids":[1,2]}'
+ headers:
+ Allow:
+ - POST, OPTIONS
+ Content-Language:
+ - en-us
+ Content-Length:
+ - '193'
+ Content-Type:
+ - application/json
+ Date:
+ - Wed, 05 Oct 2022 16:26:14 GMT
+ Referrer-Policy:
+ - same-origin
+ Server:
+ - WSGIServer/0.2 CPython/3.8.13
+ Set-Cookie:
+ - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w;
+ expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/;
+ SameSite=Lax
+ Vary:
+ - Accept-Language, Cookie, Origin
+ X-Content-Type-Options:
+ - nosniff
+ X-Frame-Options:
+ - DENY
+ status:
+ code: 201
+ message: Created
+version: 1