diff --git a/docs/requirements.txt b/docs/requirements.txt index 564c26d73..bb6c562e4 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -20,7 +20,7 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==4.12.0 +importlib-metadata==5.0.0 # via sphinx jinja2==3.1.2 # via sphinx @@ -38,7 +38,7 @@ requests==2.28.1 # via sphinx snowballstemmer==2.2.0 # via sphinx -sphinx==5.1.1 +sphinx==5.2.3 # via # -r requirements/build.in # sphinx-rtd-theme diff --git a/requirements/build.txt b/requirements/build.txt index 3064ff5ec..bb6c562e4 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -20,6 +20,8 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx +importlib-metadata==5.0.0 + # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.1 @@ -38,10 +40,10 @@ snowballstemmer==2.2.0 # via sphinx sphinx==5.2.3 # via - # -r build.in + # -r requirements/build.in # sphinx-rtd-theme sphinx-rtd-theme==1.0.0 - # via -r build.in + # via -r requirements/build.in sphinxcontrib-applehelp==1.0.2 # via sphinx sphinxcontrib-devhelp==1.0.2 @@ -56,3 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx urllib3==1.26.12 # via requests +zipp==3.8.1 + # via importlib-metadata diff --git a/requirements/test.in b/requirements/test.in index 3b50deae3..4b453feeb 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -7,3 +7,5 @@ click>=8.1 flake8 mypy pytest-cov +label_studio_sdk +vcrpy diff --git a/requirements/test.txt b/requirements/test.txt index 68bc54cb5..8c7b853f1 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -7,23 +7,37 @@ attrs==22.1.0 # via pytest black==22.8.0 - # via -r test.in + # via -r requirements/test.in +certifi==2022.9.24 + # via requests +charset-normalizer==2.1.1 + # via requests click==8.1.3 # via - # -r test.in + # -r requirements/test.in # black coverage[toml]==6.4.4 # via - # -r test.in + # -r requirements/test.in # pytest-cov flake8==5.0.4 - # via -r test.in + # via -r requirements/test.in +idna==3.4 + # via + # requests + # yarl iniconfig==1.1.1 # via pytest +label-studio-sdk==0.0.15 + # via -r requirements/test.in +lxml==4.9.1 + # via label-studio-sdk mccabe==0.7.0 # via flake8 +multidict==6.0.2 + # via yarl mypy==0.982 - # via -r test.in + # via -r requirements/test.in mypy-extensions==0.4.3 # via # black @@ -40,6 +54,8 @@ py==1.11.0 # via pytest pycodestyle==2.9.1 # via flake8 +pydantic==1.8.2 + # via label-studio-sdk pyflakes==2.5.0 # via flake8 pyparsing==3.0.9 @@ -47,7 +63,13 @@ pyparsing==3.0.9 pytest==7.1.3 # via pytest-cov pytest-cov==4.0.0 - # via -r test.in + # via -r requirements/test.in +pyyaml==6.0 + # via vcrpy +requests==2.28.1 + # via label-studio-sdk +six==1.16.0 + # via vcrpy tomli==2.0.1 # via # black @@ -55,4 +77,15 @@ tomli==2.0.1 # mypy # pytest typing-extensions==4.3.0 - # via mypy + # via + # black + # mypy + # pydantic +urllib3==1.26.12 + # via requests +vcrpy==4.2.1 + # via -r requirements/test.in +wrapt==1.14.1 + # via vcrpy +yarl==1.8.1 + # via vcrpy diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py index 1514ca3de..3bc841181 100644 --- a/test_unstructured/staging/test_label_studio.py +++ b/test_unstructured/staging/test_label_studio.py @@ -3,12 +3,69 @@ import unstructured.staging.label_studio as label_studio from unstructured.documents.elements import Title, NarrativeText +from label_studio_sdk.client import Client + +import logging +import re +import vcr + @pytest.fixture def elements(): return [Title(text="Title 1"), NarrativeText(text="Narrative 1")] +@vcr.use_cassette("test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml") +def test_upload_label_studio_data_with_sdk(caplog, elements): + """ + Testing Instructions + ==================== + 1. Remove file `test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml`, + which will be recreated later. + 2. Install the label-studio package by running command `pip install -U label-studio`. + 3. Run command `label-studio`, and login or set up label studio account on pop-up website. + 4. Update `LABEL_STUDIO_URL` and `API_KEY` below, you can find your API_KEY by + clicking into your account profile. + 5. Run this test once, and VCR will record the HTTP request to the yaml file. + 6. Kill the label studio instance and run the test again, VCR will replay the response. + """ + log = logging.getLogger("urllib3") + log.setLevel(logging.DEBUG) + # Define the URL where Label Studio is accessible + LABEL_STUDIO_URL = "http://localhost:8080" + # API_KEY is a temporary key from local install not actually valid anywhere + # Update it if the vcr cassette is updated with the API key from your user account + API_KEY = "d44b92c31f592583bffb7e0d817a60c16a937bca" + # Connect to the Label Studio API and check the connection + ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY) + ls.check_connection() + ls.delete_all_projects() + # Create a sample project to classify types of texts + project = ls.start_project( + title="Text Type Classifications", + label_config=""" + + + +
+ + + + + + + """, + ) + label_studio_data = label_studio.stage_for_label_studio(elements) + project.import_tasks(label_studio_data) + # Check success status code (201) for posting tasks job in logger info + success_posting_tasks_status = re.compile(r"POST /api/projects/.*/import.*201") + assert bool(success_posting_tasks_status.search(caplog.text)) + + def test_convert_to_label_studio_data(elements): label_studio_data = label_studio.stage_for_label_studio(elements) diff --git a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml b/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml new file mode 100644 index 000000000..fcfb2b726 --- /dev/null +++ b/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml @@ -0,0 +1,319 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Authorization: + - Token d44b92c31f592583bffb7e0d817a60c16a937bca + Connection: + - keep-alive + User-Agent: + - python-requests/2.27.1 + method: GET + uri: http://localhost:8080/health + response: + body: + string: '{"status": "UP"}' + headers: + Content-Language: + - en-us + Content-Length: + - '16' + Content-Type: + - text/html; charset=utf-8 + Date: + - Wed, 05 Oct 2022 16:26:14 GMT + Referrer-Policy: + - same-origin + Server: + - WSGIServer/0.2 CPython/3.8.13 + Set-Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w; + expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/; + SameSite=Lax + Vary: + - Accept-Language, Cookie, Origin + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Authorization: + - Token d44b92c31f592583bffb7e0d817a60c16a937bca + Connection: + - keep-alive + Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w + User-Agent: + - python-requests/2.27.1 + method: GET + uri: http://localhost:8080/api/projects?page_size=10000000 + response: + body: + string: '{"count":1,"next":null,"previous":null,"results":[{"id":95,"title":"Text + Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":1,"first_name":"","last_name":"","email":"yuming@unstructured.io","avatar":null},"created_at":"2022-10-05T16:15:26.800180Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential + sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null}]}' + headers: + Allow: + - GET, POST, HEAD, OPTIONS + Content-Language: + - en-us + Content-Length: + - '2002' + Content-Type: + - application/json + Date: + - Wed, 05 Oct 2022 16:26:14 GMT + Referrer-Policy: + - same-origin + Server: + - WSGIServer/0.2 CPython/3.8.13 + Set-Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w; + expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/; + SameSite=Lax + Vary: + - Accept-Language, Cookie, Origin + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Authorization: + - Token d44b92c31f592583bffb7e0d817a60c16a937bca + Connection: + - keep-alive + Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w + User-Agent: + - python-requests/2.27.1 + method: GET + uri: http://localhost:8080/api/projects/95 + response: + body: + string: '{"id":95,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":1,"first_name":"","last_name":"","email":"yuming@unstructured.io","avatar":null},"created_at":"2022-10-05T16:15:26.800180Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential + sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Content-Language: + - en-us + Content-Length: + - '1950' + Content-Type: + - application/json + Date: + - Wed, 05 Oct 2022 16:26:14 GMT + Referrer-Policy: + - same-origin + Server: + - WSGIServer/0.2 CPython/3.8.13 + Set-Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w; + expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/; + SameSite=Lax + Vary: + - Accept-Language, Cookie, Origin + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Authorization: + - Token d44b92c31f592583bffb7e0d817a60c16a937bca + Connection: + - keep-alive + Content-Length: + - '0' + Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w + User-Agent: + - python-requests/2.27.1 + method: DELETE + uri: http://localhost:8080/api/projects/95/ + response: + body: + string: '' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Content-Language: + - en-us + Content-Length: + - '0' + Date: + - Wed, 05 Oct 2022 16:26:14 GMT + Referrer-Policy: + - same-origin + Server: + - WSGIServer/0.2 CPython/3.8.13 + Set-Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w; + expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/; + SameSite=Lax + Vary: + - Accept-Language, Cookie, Origin + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 204 + message: No Content +- request: + body: '{"title": "Text Type Classifications", "label_config": "\n \n \n \n
\n \n \n \n \n \n \n "}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Authorization: + - Token d44b92c31f592583bffb7e0d817a60c16a937bca + Connection: + - keep-alive + Content-Length: + - '591' + Content-Type: + - application/json + Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w + User-Agent: + - python-requests/2.27.1 + method: POST + uri: http://localhost:8080/api/projects + response: + body: + string: '{"id":96,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":1,"first_name":"","last_name":"","email":"yuming@unstructured.io","avatar":null},"created_at":"2022-10-05T16:26:14.756037Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":null,"task_number":null,"useful_annotation_number":null,"ground_truth_number":null,"skipped_annotations_number":null,"total_annotations_number":null,"total_predictions_number":null,"sampling":"Sequential + sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null}' + headers: + Allow: + - GET, POST, HEAD, OPTIONS + Content-Language: + - en-us + Content-Length: + - '1971' + Content-Type: + - application/json + Date: + - Wed, 05 Oct 2022 16:26:14 GMT + Referrer-Policy: + - same-origin + Server: + - WSGIServer/0.2 CPython/3.8.13 + Set-Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w; + expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/; + SameSite=Lax + Vary: + - Accept-Language, Cookie, Origin + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 201 + message: Created +- request: + body: '[{"data": {"text": "Title 1", "ref_id": "ab03af41c2940e7584b62df48a964db3"}}, + {"data": {"text": "Narrative 1", "ref_id": "ff9eb806beb1f483322f6fbda680b08b"}}]' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Authorization: + - Token d44b92c31f592583bffb7e0d817a60c16a937bca + Connection: + - keep-alive + Content-Length: + - '158' + Content-Type: + - application/json + Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w + User-Agent: + - python-requests/2.27.1 + method: POST + uri: http://localhost:8080/api/projects/96/import?return_task_ids=1 + response: + body: + string: '{"task_count":2,"annotation_count":0,"prediction_count":0,"duration":0.012760162353515625,"file_upload_ids":[],"could_be_tasks_list":false,"found_formats":[],"data_columns":[],"task_ids":[1,2]}' + headers: + Allow: + - POST, OPTIONS + Content-Language: + - en-us + Content-Length: + - '193' + Content-Type: + - application/json + Date: + - Wed, 05 Oct 2022 16:26:14 GMT + Referrer-Policy: + - same-origin + Server: + - WSGIServer/0.2 CPython/3.8.13 + Set-Cookie: + - sessionid=eyJ1aWQiOiJjYjYxOWVmYi05ZDU1LTQzNWYtOGQ4Ni00ZjcyZGJjMDM2ZTYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1og7Dq:qxIxdgId2dOfw5lhYIjhXa3XGZd91f5GTyNXBnwFm_w; + expires=Wed, 19 Oct 2022 16:26:14 GMT; HttpOnly; Max-Age=1209600; Path=/; + SameSite=Lax + Vary: + - Accept-Language, Cookie, Origin + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 201 + message: Created +version: 1