diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dbf3fb1cf..3cb70c60c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -288,6 +288,9 @@ jobs: SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} + SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}} + SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}} + SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 5067379c3..45e7f67a0 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -85,6 +85,9 @@ jobs: SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} + SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}} + SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}} + SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 8131d269e..ca337ff4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,11 @@ ### Features * **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function. 
+* **Adds permissions (RBAC) data ingestion functionality for the Sharepoint connector.** Problem: Role-based access control is an important component in many data storage systems. Users may need to pass permissions (RBAC) data to downstream systems when ingesting data. Feature: Added permissions data ingestion functionality to the Sharepoint connector. + ### Fixes -* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list items +* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list items * **Fixes duplicated elements** Fixes issue where elements are duplicated when embeddings are generated. This will allow users to generate embeddings for their list of Elements without duplicating/breaking the orginal content. * **Fixes failure when flagging for embeddings through unstructured-ingest** Currently adding the embedding parameter to any connector results in a failure on the copy stage. This is resolves the issue by adding the IngestDoc to the context map in the embedding node's `run` method. This allows users to specify that connectors fetch embeddings without failure. * **Fix ingest pipeline reformat nodes not discoverable** Fixes issue where reformat nodes raise ModuleNotFoundError on import. This was due to the directory was missing `__init__.py` in order to make it discoverable. 
diff --git a/docs/source/source_connectors/sharepoint.rst b/docs/source/source_connectors/sharepoint.rst index bc0b144dd..ce5fba4c1 100644 --- a/docs/source/source_connectors/sharepoint.rst +++ b/docs/source/source_connectors/sharepoint.rst @@ -22,6 +22,9 @@ Run Locally --client-id "" \ --client-cred "" \ --site "" \ + --permissions-application-id "" \ + --permissions-client-cred "" \ + --permissions-tenant "" \ --files-only "Flag to process only files within the site(s)" \ --output-dir sharepoint-ingest-output \ --num-processes 2 \ @@ -46,6 +49,10 @@ Run Locally client_id="", client_cred="", site="", + # Credentials to process data about permissions (rbac) within the tenant + permissions_application_id="", + permissions_client_cred="", + permissions_tenant="", # Flag to process only files within the site(s) files_only=True, path="Shared Documents", @@ -68,6 +75,9 @@ You can also use upstream connectors with the ``unstructured`` API. For this you --client-id "" \ --client-cred "" \ --site "" \ + --permissions-application-id "" \ + --permissions-client-cred "" \ + --permissions-tenant "" \ --files-only "Flag to process only files within the site(s)" \ --output-dir sharepoint-ingest-output \ --num-processes 2 \ @@ -98,6 +108,10 @@ You can also use upstream connectors with the ``unstructured`` API. 
For this you client_id="", client_cred="", site="", + # Credentials to process data about permissions (rbac) within the tenant + permissions_application_id="", + permissions_client_cred="", + permissions_tenant="", # Flag to process only files within the site(s) files_only=True, path="Shared Documents", diff --git a/examples/ingest/sharepoint/ingest.sh b/examples/ingest/sharepoint/ingest.sh old mode 100644 new mode 100755 index 53a121820..2d1f15e46 --- a/examples/ingest/sharepoint/ingest.sh +++ b/examples/ingest/sharepoint/ingest.sh @@ -12,6 +12,8 @@ # To get the credentials for your Sharepoint app, follow these steps: # https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal +# To optionally set up your application and obtain permissions related variables (--permissions-application-id, --permissions-client-cred, --permissions-tenant), follow these steps: +# https://tsmatz.wordpress.com/2016/10/07/application-permission-with-v2-endpoint-and-microsoft-graph SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) @@ -22,6 +24,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --client-id "" \ --client-cred "" \ --site "" \ + --permissions-application-id "" \ + --permissions-client-cred "" \ + --permissions-tenant "" \ --files-only "Flag to process only files within the site(s)" \ --output-dir sharepoint-ingest-output \ --num-processes 2 \ diff --git a/test_unstructured_ingest/check-diff-expected-output.sh b/test_unstructured_ingest/check-diff-expected-output.sh index 3a886d671..e9e9a9092 100755 --- a/test_unstructured_ingest/check-diff-expected-output.sh +++ b/test_unstructured_ingest/check-diff-expected-output.sh @@ -46,6 +46,7 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then elif ! 
diff -ru "$EXPECTED_OUTPUT_DIR" "$OUTPUT_DIR" ; then "$SCRIPT_DIR"/json-to-clean-text-folder.sh "$EXPECTED_OUTPUT_DIR" "$EXPECTED_OUTPUT_DIR_TEXT" "$SCRIPT_DIR"/json-to-clean-text-folder.sh "$OUTPUT_DIR" "$OUTPUT_DIR_TEXT" + "$SCRIPT_DIR"/clean-permissions-files.sh "$OUTPUT_DIR_TEXT" diff -ru "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"> outputdiff.txt cat outputdiff.txt diffstat -c outputdiff.txt diff --git a/test_unstructured_ingest/clean-permissions-files.sh b/test_unstructured_ingest/clean-permissions-files.sh new file mode 100755 index 000000000..5cb6b811b --- /dev/null +++ b/test_unstructured_ingest/clean-permissions-files.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Description: Delete (cleanup) permissions files in a folder, so that they are not included in +# text diff tests. +# +# Arguments: +# - $1: Name of the folder to do the cleanup operation in. + +set +e +if [ "$#" -ne 1 ]; then + echo "Please provide a folder to clean the files in: $0 " + exit 1 +fi + +folder_path="$1" +if [ ! -d "$folder_path" ]; then + echo "'$folder_path' is not a directory. Please provide a folder / directory." 
+ exit 1 +fi + +for file in "$folder_path"/*_SEP_*; do + if [ -e "$file" ]; then + rm "$file" + fi +done + +echo "Completed cleanup for permissions files" diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/fake-text.txt.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/fake-text.txt.json index 07dd9abb2..3300ee7b0 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/fake-text.txt.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/fake-text.txt.json @@ -11,7 +11,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:04:55", - "date_modified": "2023-06-16T05:04:55" + "date_modified": "2023-06-16T05:04:55", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} 
+ }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "fake-text.txt", "filetype": "text/plain", @@ -33,7 +121,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:04:55", - "date_modified": "2023-06-16T05:04:55" + "date_modified": "2023-06-16T05:04:55", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": 
"Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "fake-text.txt", "filetype": "text/plain", @@ -55,7 +231,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:04:55", - "date_modified": "2023-06-16T05:04:55" + "date_modified": "2023-06-16T05:04:55", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": 
"Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "fake-text.txt", "filetype": "text/plain", @@ -77,7 +341,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:04:55", - "date_modified": "2023-06-16T05:04:55" + "date_modified": "2023-06-16T05:04:55", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site 
Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "fake-text.txt", "filetype": "text/plain", @@ -99,7 +451,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:04:55", - "date_modified": "2023-06-16T05:04:55" + "date_modified": "2023-06-16T05:04:55", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + 
"id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "fake-text.txt", "filetype": "text/plain", @@ -121,7 +561,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:04:55", - "date_modified": "2023-06-16T05:04:55" + "date_modified": "2023-06-16T05:04:55", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + 
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "fake-text.txt", "filetype": "text/plain", diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json index 31e3cba2d..20a30b762 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json @@ -11,7 +11,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": 
"2023-06-16T05:04:47", - "date_modified": "2023-06-16T05:04:47" + "date_modified": "2023-06-16T05:04:47", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": 
"ideas-page.html", "filetype": "text/html", diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json index a9e4e3ca0..6b10cd18c 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json @@ -11,7 +11,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:05:05", - "date_modified": "2023-06-16T05:05:05" + "date_modified": "2023-06-16T05:05:05", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + 
"roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "stanley-cups.xlsx", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", @@ -36,7 +124,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:05:05", - "date_modified": "2023-06-16T05:05:05" + "date_modified": "2023-06-16T05:05:05", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + 
"grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "stanley-cups.xlsx", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", @@ -61,7 +237,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:05:05", - "date_modified": "2023-06-16T05:05:05" + "date_modified": "2023-06-16T05:05:05", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": 
"Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "stanley-cups.xlsx", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", @@ -86,7 +350,95 @@ "site_url": "https://unstructuredio.sharepoint.com" }, "date_created": "2023-06-16T05:05:05", - "date_modified": "2023-06-16T05:05:05" + "date_modified": "2023-06-16T05:05:05", + "permissions_data": [ + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "roles": [ + "owner" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Owners", + "id": "3", + "loginName": "Communication site Owners" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Owners" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "roles": [ + "read" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Visitors", + "id": "4", + "loginName": "Communication site Visitors" + } + }, + "grantedTo": 
{ + "user": { + "displayName": "Communication site Visitors" + } + }, + "inheritedFrom": {} + }, + { + "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "roles": [ + "write" + ], + "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM", + "grantedToV2": { + "siteGroup": { + "displayName": "Communication site Members", + "id": "5", + "loginName": "Communication site Members" + } + }, + "grantedTo": { + "user": { + "displayName": "Communication site Members" + } + }, + "inheritedFrom": {} + }, + { + "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "roles": [ + "owner" + ], + "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE", + "grantedToV2": { + "group": { + "@odata.type": "#microsoft.graph.sharePointIdentity", + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + }, + "siteUser": { + "displayName": "Global Administrator", + "id": "7", + "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "grantedTo": { + "user": { + "displayName": "Global Administrator", + "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51" + } + }, + "inheritedFrom": {} + } + ] }, "filename": "stanley-cups.xlsx", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", diff --git a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh index aa3f8b91f..fd754a5e9 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh @@ -19,6 +19,11 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then exit 0 fi +if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then + echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT 
env var is not set." + exit 0 +fi + if [ -z "$OPENAI_API_KEY" ]; then echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set." exit 0 @@ -85,6 +90,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --client-cred "$SHAREPOINT_CRED" \ --client-id "$SHAREPOINT_CLIENT_ID" \ --site "$SHAREPOINT_SITE" \ + --permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \ + --permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \ + --permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \ --path "Shared Documents" \ --recursive \ --embedding-api-key "$OPENAI_API_KEY" \ diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh index 8aea377c4..504a3b83e 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -26,6 +26,12 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." exit 0 fi + +if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then + echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." + exit 0 +fi + # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly PYTHONPATH=. ./unstructured/ingest/main.py \ sharepoint \ @@ -40,6 +46,9 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --client-cred "$SHAREPOINT_CRED" \ --client-id "$SHAREPOINT_CLIENT_ID" \ --site "$SHAREPOINT_SITE" \ + --permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \ + --permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \ + --permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \ --path "Shared Documents" \ --recursive \ --work-dir "$WORK_DIR" diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 8ccc5ca42..2d130c4df 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -45,6 +45,7 @@ class DataSourceMetadata: date_created: Optional[str] = None date_modified: Optional[str] = None date_processed: Optional[str] = None + permissions_data: Optional[List[Dict[str, Any]]] = None def to_dict(self): return {key: value for key, value in self.__dict__.items() if value is not None} diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index 36be232e7..d5d5e04d2 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -11,6 +11,7 @@ from unstructured.ingest.interfaces import ( ChunkingConfig, EmbeddingConfig, PartitionConfig, + PermissionsConfig, ProcessorConfig, ReadConfig, ) @@ -287,12 +288,12 @@ class CliEmbeddingConfig(EmbeddingConfig, CliMixin): ): """ Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. 
- This allows CLI arguments to be prepended with chunk_ during CLI invocation but + This allows CLI arguments to be prepended with embedding_ during CLI invocation but doesn't require that as part of the field names in this class """ if isinstance(kvs, dict): new_kvs = { - k[len("embedding-") :]: v # noqa: E203 + k[len("embedding_") :]: v # noqa: E203 for k, v in kvs.items() if k.startswith("embedding_") } @@ -363,3 +364,75 @@ class CliChunkingConfig(ChunkingConfig, CliMixin): return None return _decode_dataclass(cls, new_kvs, infer_missing) return _decode_dataclass(cls, kvs, infer_missing) + + +class CliPermissionsConfig(PermissionsConfig, CliMixin): + @staticmethod + def add_cli_options(cmd: click.Command) -> None: + options = [ + click.Option( + ["--permissions-application-id"], + type=str, + help="Microsoft Graph API application id", + ), + click.Option( + ["--permissions-client-cred"], + type=str, + help="Microsoft Graph API application credentials", + ), + click.Option( + ["--permissions-tenant"], + type=str, + help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.", + ), + ] + cmd.params.extend(options) + + @classmethod + def from_dict( + cls, + kvs: Json, + *, + infer_missing=False, + ): + """ + Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. + This allows CLI arguments to be prepended with permissions_ during CLI invocation but + doesn't require that as part of the field names in this class. It also checks if the + CLI params are provided as intended. 
+ """ + + if ( + isinstance(kvs, dict) + and any( + [ + kvs["permissions_application_id"] + or kvs["permissions_client_cred"] + or kvs["permissions_tenant"], + ], + ) + and not all( + [ + kvs["permissions_application_id"] + and kvs["permissions_client_cred"] + and kvs["permissions_tenant"], + ], + ) + ): + raise ValueError( + "Please provide either none or all of the following optional values:\n" + "--permissions-application-id\n" + "--permissions-client-cred\n" + "--permissions-tenant", + ) + + if isinstance(kvs, dict): + new_kvs = { + k[len("permissions_") :]: v # noqa: E203 + for k, v in kvs.items() + if k.startswith("permissions_") + } + if len(new_kvs.keys()) == 0: + return None + return _decode_dataclass(cls, new_kvs, infer_missing) + return _decode_dataclass(cls, kvs, infer_missing) diff --git a/unstructured/ingest/cli/utils.py b/unstructured/ingest/cli/utils.py index e04ce48d1..61df72f40 100644 --- a/unstructured/ingest/cli/utils.py +++ b/unstructured/ingest/cli/utils.py @@ -8,6 +8,7 @@ from unstructured.ingest.cli.interfaces import ( CliEmbeddingConfig, CliMixin, CliPartitionConfig, + CliPermissionsConfig, CliProcessorConfig, CliReadConfig, ) @@ -39,6 +40,7 @@ def extract_configs( "embedding_config": CliEmbeddingConfig.from_dict(data), "chunking_config": CliChunkingConfig.from_dict(data), "processor_config": CliProcessorConfig.from_dict(data), + "permissions_config": CliPermissionsConfig.from_dict(data), } for v in validate: v.from_dict(data) @@ -52,6 +54,7 @@ def add_options(cmd: click.Command, extras=t.List[t.Type[CliMixin]]) -> click.Co CliEmbeddingConfig, CliChunkingConfig, CliProcessorConfig, + CliPermissionsConfig, ] configs.extend(extras) for config in configs: diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py index e80bed98b..8fe290b83 100644 --- a/unstructured/ingest/connector/sharepoint.py +++ b/unstructured/ingest/connector/sharepoint.py @@ -1,3 +1,5 @@ +import json +import os import typing as 
t from dataclasses import dataclass, field from datetime import datetime @@ -15,6 +17,7 @@ from unstructured.ingest.interfaces import ( SourceConnectorCleanupMixin, SourceMetadata, ) +from unstructured.ingest.interfaces import PermissionsConfig as SharepointPermissionsConfig from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies @@ -35,6 +38,7 @@ class SimpleSharepointConfig(BaseConnectorConfig): path: str process_pages: bool = False recursive: bool = False + permissions_config: t.Optional[SharepointPermissionsConfig] = None def __post_init__(self): if not (self.client_id and self.client_credential and self.site_url): @@ -57,6 +61,14 @@ class SimpleSharepointConfig(BaseConnectorConfig): raise return site_client + def get_permissions_client(self): + try: + permissions_connector = SharepointPermissionsConnector(self.permissions_config) + assert permissions_connector.access_token + return permissions_connector + except Exception as e: + logger.error("Couldn't obtain Sharepoint permissions ingestion access token:", e) + @dataclass class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): @@ -122,7 +134,6 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): file = site_client.web.get_file_by_server_relative_url(self.server_path) if properties_only: file = file.get().execute_query() - except ClientRequestException as e: if e.response.status_code == 404: return None @@ -144,6 +155,44 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): return None return page + def update_permissions_data(self): + def parent_name_matches(parent_type, permissions_filename, ingest_doc_filepath): + permissions_filename = permissions_filename.split("_SEP_") + ingest_doc_filepath = ingest_doc_filepath.split("/") + + if parent_type == "sites": + return permissions_filename[0] == ingest_doc_filepath[1] + + elif parent_type == "SitePages" or parent_type == "Shared Documents": + return True + + permissions_data = None 
+ permissions_dir = Path(self.processor_config.output_dir) / "permissions_data" + + if permissions_dir.is_dir(): + parent_type = self.file_path.split("/")[0] + + if parent_type == "sites": + read_dir = permissions_dir / "sites" + elif parent_type == "SitePages" or parent_type == "Shared Documents": + read_dir = permissions_dir / "other" + else: + read_dir = permissions_dir / "other" + + for filename in os.listdir(read_dir): + permissions_docname = os.path.splitext(filename)[0].split("_SEP_")[1] + ingestdoc_docname = self.file_path.split("/")[-1] + + if ingestdoc_docname == permissions_docname and parent_name_matches( + parent_type=parent_type, + permissions_filename=filename, + ingest_doc_filepath=self.file_path, + ): + with open(read_dir / filename) as f: + permissions_data = json.loads(f.read()) + + return permissions_data + def update_source_metadata(self, **kwargs): if self.is_page: page = self._fetch_page() @@ -158,6 +207,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): version=page.get_property("Version", ""), source_url=page.absolute_url, exists=True, + permissions_data=self.update_permissions_data() + if self.connector_config.permissions_config + else None, ) return @@ -176,6 +228,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): version=file.major_version, source_url=file.properties.get("LinkingUrl", None), exists=True, + permissions_data=self.update_permissions_data() + if self.connector_config.permissions_config + else None, ) def _download_page(self): @@ -317,6 +372,12 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector def get_ingest_docs(self): base_site_client = self.connector_config.get_site_client() + + if self.connector_config.permissions_config: + permissions_client = self.connector_config.get_permissions_client() + if permissions_client: + permissions_client.write_all_permissions(self.processor_config.output_dir) + if not base_site_client.is_tenant: return 
self._ingest_site_docs(base_site_client) tenant = base_site_client.tenant @@ -328,3 +389,166 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector site_client = self.connector_config.get_site_client(site_url) ingest_docs = ingest_docs + self._ingest_site_docs(site_client) return ingest_docs + + +@dataclass +class SharepointPermissionsConnector: + def __init__(self, permissions_config): + self.permissions_config: SharepointPermissionsConfig = permissions_config + self.initialize() + + def initialize(self): + self.access_token: str = self.get_access_token() + + @requires_dependencies(["requests"], extras="sharepoint") + def get_access_token(self) -> str: + import requests + + url = ( + f"https://login.microsoftonline.com/{self.permissions_config.tenant}/oauth2/v2.0/token" + ) + headers = {"Content-Type": "application/x-www-form-urlencoded"} + data = { + "client_id": self.permissions_config.application_id, + "scope": "https://graph.microsoft.com/.default", + "client_secret": self.permissions_config.client_cred, + "grant_type": "client_credentials", + } + response = requests.post(url, headers=headers, data=data) + return response.json()["access_token"] + + def validated_response(self, response): + if response.status_code == 200: + return response.json() + else: + print(f"Request failed with status code {response.status_code}:") + print(response.text) + + @requires_dependencies(["requests"], extras="sharepoint") + def get_sites(self): + import requests + + url = "https://graph.microsoft.com/v1.0/sites" + params = { + "$select": "webUrl, id", + } + + headers = { + "Authorization": f"Bearer {self.access_token}", + } + + response = requests.get(url, params=params, headers=headers) + return self.validated_response(response) + + @requires_dependencies(["requests"], extras="sharepoint") + def get_drives(self, site): + import requests + + url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives" + + headers = { + "Authorization": f"Bearer 
{self.access_token}", + } + + response = requests.get(url, headers=headers) + + return self.validated_response(response) + + @requires_dependencies(["requests"], extras="sharepoint") + def get_drive_items(self, site, drive_id): + import requests + + url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives/{drive_id}/root/children" + + headers = { + "Authorization": f"Bearer {self.access_token}", + } + + response = requests.get(url, headers=headers) + + return self.validated_response(response) + + def extract_site_name_from_weburl(self, weburl): + split_path = urlparse(weburl).path.lstrip("/").split("/") + + if split_path[0] == "sites": + return "sites", split_path[1] + + elif split_path[0] == "Shared%20Documents": + return "Shared Documents", "Shared Documents" + + elif split_path[0] == "personal": + return "Personal", "Personal" + + elif split_path[0] == "_layouts": + return "layouts", "layouts" + + # if other weburl structures are found, additional logic might need to be implemented + + logger.warning( + """Couldn't extract sitename, unknown site or parent type. 
Skipping permissions + ingestion for the document with the URL:""", + weburl, + ) + + return None, None + + @requires_dependencies(["requests"], extras="sharepoint") + def get_permissions_for_drive_item(self, site, drive_id, item_id): + import requests + + url = f"https://graph.microsoft.com/v1.0/sites/ \ + {site}/drives/{drive_id}/items/{item_id}/permissions" + + headers = { + "Authorization": f"Bearer {self.access_token}", + } + + response = requests.get(url, headers=headers) + + return self.validated_response(response) + + def write_all_permissions(self, output_dir): + sites = [(site["id"], site["webUrl"]) for site in self.get_sites()["value"]] + drive_ids = [] + + print("Obtaining drive data for sites for permissions (rbac)") + for site_id, site_url in sites: + drives = self.get_drives(site_id) + if drives: + drives_for_site = drives["value"] + drive_ids.extend([(site_id, drive["id"]) for drive in drives_for_site]) + + print("Obtaining item data from drives for permissions (rbac)") + item_ids = [] + for site, drive_id in drive_ids: + drive_items = self.get_drive_items(site, drive_id) + if drive_items: + item_ids.extend( + [ + (site, drive_id, item["id"], item["name"], item["webUrl"]) + for item in drive_items["value"] + ], + ) + + permissions_dir = Path(output_dir) / "permissions_data" + + print("Writing permissions data to disk") + for site, drive_id, item_id, item_name, item_web_url in item_ids: + res = self.get_permissions_for_drive_item(site, drive_id, item_id) + if res: + parent_type, parent_name = self.extract_site_name_from_weburl(item_web_url) + + if parent_type == "sites": + write_path = permissions_dir / "sites" / f"{parent_name}_SEP_{item_name}.json" + + elif parent_type == "Personal" or parent_type == "Shared Documents": + write_path = permissions_dir / "other" / f"{parent_name}_SEP_{item_name}.json" + else: + write_path = permissions_dir / "other" / f"{parent_name}_SEP_{item_name}.json" + + if not Path(os.path.dirname(write_path)).is_dir(): + 
os.makedirs(os.path.dirname(write_path)) + + with open(write_path, "w") as f: + json.dump(res["value"], f) diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 8a0710811..f3deefbb0 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -107,6 +107,14 @@ class ChunkingConfig(BaseConfig): return elements +@dataclass +class PermissionsConfig(BaseConfig): + application_id: t.Optional[str] + client_cred: t.Optional[str] + tenant: t.Optional[str] + pass + + @dataclass class WriteConfig(BaseConfig): pass @@ -123,6 +131,7 @@ class SourceMetadata(DataClassJsonMixin, ABC): version: t.Optional[str] = None source_url: t.Optional[str] = None exists: t.Optional[bool] = None + permissions_data: t.Optional[t.List[t.Dict[str, t.Any]]] = None @dataclass @@ -212,6 +221,13 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC): the version of the document.""" return self.source_metadata.version # type: ignore + @property + def permissions_data(self) -> t.Optional[t.List[t.Dict[str, t.Any]]]: + """Access control data, aka permissions or sharing, from the source system.""" + if self.source_metadata is None: + self.update_source_metadata() + return self.source_metadata.permissions_data # type: ignore + @abstractmethod def cleanup_file(self): """Removes the local copy the file (or anything else) after successful processing.""" @@ -240,6 +256,12 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC): """Sets the SourceMetadata and the properties for the doc""" self._source_metadata = SourceMetadata() + def update_permissions_data(self): + """Sets the _permissions_data property for the doc. 
+ This property is later used to fill the corresponding SourceMetadata.permissions_data field, + and after that carries on to the permissions_data property.""" + self._permissions_data: t.Optional[t.List[t.Dict]] = None + # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods # in addition to or instead of get_file() @abstractmethod @@ -269,6 +291,7 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC): date_created=self.date_created, date_modified=self.date_modified, date_processed=self.date_processed, + permissions_data=self.permissions_data, ), **partition_kwargs, ) @@ -420,6 +443,38 @@ class SourceConnectorCleanupMixin: os.rmdir(cur_dir) +class PermissionsCleanupMixin: + processor_config: ProcessorConfig + + def cleanup_permissions(self, cur_dir=None): + def has_no_folders(folder_path): + folders = [ + item + for item in os.listdir(folder_path) + if os.path.isdir(os.path.join(folder_path, item)) + ] + return len(folders) == 0 + + """Recursively clean up downloaded files and directories.""" + if cur_dir is None: + cur_dir = Path(self.processor_config.output_dir, "permissions_data") + if cur_dir is None: + return + if Path(cur_dir).is_file(): + cur_file = cur_dir + os.remove(cur_file) + return + sub_dirs = os.listdir(cur_dir) + os.chdir(cur_dir) + for sub_dir in sub_dirs: + # don't traverse symlinks, not that there every should be any + if not os.path.islink(sub_dir): + self.cleanup_permissions(sub_dir) + os.chdir("..") + if has_no_folders(cur_dir): + os.rmdir(cur_dir) + + class IngestDocCleanupMixin: read_config: ReadConfig diff --git a/unstructured/ingest/pipeline/__init__.py b/unstructured/ingest/pipeline/__init__.py index 19d78bdbc..439647b60 100644 --- a/unstructured/ingest/pipeline/__init__.py +++ b/unstructured/ingest/pipeline/__init__.py @@ -1,6 +1,7 @@ from .doc_factory import DocFactory from .interfaces import PipelineContext, ReformatNode from .partition import Partitioner +from .permissions import PermissionsDataCleaner from 
.pipeline import Pipeline from .reformat.chunking import Chunker from .reformat.embedding import Embedder @@ -17,4 +18,5 @@ __all__ = [ "Writer", "Chunker", "ReformatNode", + "PermissionsDataCleaner", ] diff --git a/unstructured/ingest/pipeline/interfaces.py b/unstructured/ingest/pipeline/interfaces.py index 69976b7dc..56f64aecb 100644 --- a/unstructured/ingest/pipeline/interfaces.py +++ b/unstructured/ingest/pipeline/interfaces.py @@ -212,3 +212,18 @@ class CopyNode(PipelineNode): @abstractmethod def run(self, json_path: str): pass + + +@dataclass +class PermissionsNode(PipelineNode): + """ + Encapsulated logic to do operations on permissions related data. + """ + + def initialize(self): + logger.info("Running permissions node to cleanup the permissions folder") + super().initialize() + + @abstractmethod + def run(self): + pass diff --git a/unstructured/ingest/pipeline/permissions.py b/unstructured/ingest/pipeline/permissions.py new file mode 100644 index 000000000..5a93b3cca --- /dev/null +++ b/unstructured/ingest/pipeline/permissions.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass + +from unstructured.ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig +from unstructured.ingest.pipeline.interfaces import PermissionsNode + + +@dataclass +class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin): + processor_config: ProcessorConfig + + def run(self): + self.cleanup_permissions() diff --git a/unstructured/ingest/pipeline/pipeline.py b/unstructured/ingest/pipeline/pipeline.py index 5b4b37ff4..8083a02ec 100644 --- a/unstructured/ingest/pipeline/pipeline.py +++ b/unstructured/ingest/pipeline/pipeline.py @@ -15,6 +15,7 @@ from unstructured.ingest.pipeline.interfaces import ( SourceNode, WriteNode, ) +from unstructured.ingest.pipeline.permissions import PermissionsDataCleaner from unstructured.ingest.pipeline.utils import get_ingest_doc_hash @@ -26,6 +27,7 @@ class Pipeline(DataClassJsonMixin): partition_node: PartitionNode write_node: 
t.Optional[WriteNode] = None reformat_nodes: t.List[ReformatNode] = field(default_factory=list) + permissions_node: t.Optional[PermissionsDataCleaner] = None def initialize(self): ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO) @@ -79,3 +81,6 @@ class Pipeline(DataClassJsonMixin): if self.write_node: self.write_node(iterable=partitioned_jsons) + + if self.permissions_node: + self.permissions_node.cleanup_permissions() diff --git a/unstructured/ingest/processor.py b/unstructured/ingest/processor.py index 8d9cd7c3e..aaee3f7c1 100644 --- a/unstructured/ingest/processor.py +++ b/unstructured/ingest/processor.py @@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import ( ChunkingConfig, EmbeddingConfig, PartitionConfig, + PermissionsConfig, ProcessorConfig, ) from unstructured.ingest.pipeline import ( @@ -15,6 +16,7 @@ from unstructured.ingest.pipeline import ( DocFactory, Embedder, Partitioner, + PermissionsDataCleaner, Pipeline, PipelineContext, Reader, @@ -33,6 +35,7 @@ def process_documents( dest_doc_connector: t.Optional[BaseDestinationConnector] = None, chunking_config: t.Optional[ChunkingConfig] = None, embedder_config: t.Optional[EmbeddingConfig] = None, + permissions_config: t.Optional[PermissionsConfig] = None, ) -> None: pipeline_config = PipelineContext.from_dict(processor_config.to_dict()) doc_factory = DocFactory( @@ -64,6 +67,11 @@ def process_documents( if dest_doc_connector else None ) + permissions_data_cleaner = ( + PermissionsDataCleaner(pipeline_context=pipeline_config, processor_config=processor_config) + if permissions_config + else None + ) pipeline = Pipeline( pipeline_context=pipeline_config, doc_factory_node=doc_factory, @@ -71,5 +79,6 @@ def process_documents( partition_node=partitioner, reformat_nodes=reformat_nodes, write_node=writer, + permissions_node=permissions_data_cleaner, ) pipeline.run() diff --git a/unstructured/ingest/runner/base_runner.py b/unstructured/ingest/runner/base_runner.py 
index 985e80223..dc4a6db94 100644 --- a/unstructured/ingest/runner/base_runner.py +++ b/unstructured/ingest/runner/base_runner.py @@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import ( ChunkingConfig, EmbeddingConfig, PartitionConfig, + PermissionsConfig, ProcessorConfig, ReadConfig, ) @@ -24,6 +25,7 @@ class Runner(ABC): writer_kwargs: t.Optional[dict] = None embedding_config: t.Optional[EmbeddingConfig] = None chunking_config: t.Optional[ChunkingConfig] = None + permissions_config: t.Optional[PermissionsConfig] = None @abstractmethod def run(self, *args, **kwargs): @@ -36,6 +38,18 @@ class Runner(ABC): return writer(**writer_kwargs) return None + def get_permissions_config(self) -> t.Optional[PermissionsConfig]: + if self.permissions_config is None: + return None + + permissions_config_filled = bool( + self.permissions_config.application_id + and self.permissions_config.client_cred + and self.permissions_config.tenant, + ) + + return self.permissions_config if permissions_config_filled else None + def process_documents(self, source_doc_connector: BaseSourceConnector): process_documents( processor_config=self.processor_config, @@ -44,4 +58,5 @@ class Runner(ABC): dest_doc_connector=self.get_dest_doc_connector(), embedder_config=self.embedding_config, chunking_config=self.chunking_config, + permissions_config=self.get_permissions_config(), ) diff --git a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py index 744abcb1a..91d47fbc1 100644 --- a/unstructured/ingest/runner/sharepoint.py +++ b/unstructured/ingest/runner/sharepoint.py @@ -1,5 +1,6 @@ import hashlib import logging +import typing as t from unstructured.ingest.logger import ingest_log_streaming_init, logger from unstructured.ingest.runner.base_runner import Runner @@ -12,6 +13,9 @@ class SharePointRunner(Runner): site: str, client_id: str, client_cred: str, + permissions_application_id: t.Optional[str], + permissions_client_cred: t.Optional[str], + permissions_tenant: 
t.Optional[str], path: str, files_only: bool = False, recursive: bool = False, @@ -31,10 +35,17 @@ class SharePointRunner(Runner): ) from unstructured.ingest.connector.sharepoint import ( + SharepointPermissionsConfig, SharepointSourceConnector, SimpleSharepointConfig, ) + permissions_config = SharepointPermissionsConfig( + application_id=permissions_application_id, + client_cred=permissions_client_cred, + tenant=permissions_tenant, + ) + source_doc_connector = SharepointSourceConnector( # type: ignore processor_config=self.processor_config, connector_config=SimpleSharepointConfig( @@ -44,6 +55,7 @@ class SharePointRunner(Runner): path=path, process_pages=(not files_only), recursive=recursive, + permissions_config=permissions_config, ), read_config=self.read_config, )