mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-17 19:14:36 +00:00
feat: add file-based access permissions for SharePoint ingest (#1628)
This PR: - defines rbac_data as a SourceMetadata field, - manages connections to an external api for obtaining rbac data with ConnectorRBAC class, - serializes rbac data and saves it to the disk, - matches the rbac_data in the disk to each IngestDoc, using a common field, - forwards rbac data to Elements, via the partition() function To test the changes, run `examples/ingest/sharepoint/ingest.sh` with the relevant rbac & connector credentials --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com>
This commit is contained in:
parent
3ec3673d34
commit
94836cfad4
3
.github/workflows/ci.yml
vendored
3
.github/workflows/ci.yml
vendored
@ -288,6 +288,9 @@ jobs:
|
||||
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
|
||||
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
|
||||
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
|
||||
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
|
||||
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
|
||||
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
|
||||
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
|
||||
|
@ -85,6 +85,9 @@ jobs:
|
||||
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
|
||||
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
|
||||
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
|
||||
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
|
||||
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
|
||||
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
|
||||
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
|
||||
|
@ -8,9 +8,11 @@
|
||||
### Features
|
||||
* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.
|
||||
|
||||
* **Adds permissions (RBAC) data ingestion functionality for the Sharepoint connector.** Problem: Role-based access control is an important component in many data storage systems. Users may need to pass permissions (RBAC) data to downstream systems when ingesting data. Feature: Added permissions data ingestion functionality to the Sharepoint connector.
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list items
|
||||
* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list item
|
||||
* **Fixes duplicated elements** Fixes issue where elements are duplicated when embeddings are generated. This will allow users to generate embeddings for their list of Elements without duplicating/breaking the original content.
|
||||
* **Fixes failure when flagging for embeddings through unstructured-ingest** Currently adding the embedding parameter to any connector results in a failure on the copy stage. This resolves the issue by adding the IngestDoc to the context map in the embedding node's `run` method. This allows users to specify that connectors fetch embeddings without failure.
|
||||
* **Fix ingest pipeline reformat nodes not discoverable** Fixes issue where reformat nodes raise ModuleNotFoundError on import. This was because the directory was missing the `__init__.py` file needed to make it discoverable.
|
||||
|
@ -22,6 +22,9 @@ Run Locally
|
||||
--client-id "<Microsoft Sharepoint app client-id>" \
|
||||
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
||||
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
||||
--permissions-application-id "<Microsoft Graph API application id, to process per-file access permissions>" \
|
||||
--permissions-client-cred "<Microsoft Graph API application credentials, to process per-file access permissions>" \
|
||||
--permissions-tenant "<e.g https://contoso.onmicrosoft.com (tenant URL) to process per-file access permissions>" \
|
||||
--files-only "Flag to process only files within the site(s)" \
|
||||
--output-dir sharepoint-ingest-output \
|
||||
--num-processes 2 \
|
||||
@ -46,6 +49,10 @@ Run Locally
|
||||
client_id="<Microsoft Sharepoint app client-id>",
|
||||
client_cred="<Microsoft Sharepoint app client-secret>",
|
||||
site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
|
||||
# Credentials to process data about permissions (rbac) within the tenant
|
||||
permissions_application_id="<Microsoft Graph API application id>",
|
||||
permissions_client_cred="<Microsoft Graph API application credentials>",
|
||||
permissions_tenant="<e.g https://contoso.onmicrosoft.com to process permission info within tenant>",
|
||||
# Flag to process only files within the site(s)
|
||||
files_only=True,
|
||||
path="Shared Documents",
|
||||
@ -68,6 +75,9 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
--client-id "<Microsoft Sharepoint app client-id>" \
|
||||
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
||||
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
||||
--permissions-application-id "<Microsoft Graph API application id, to process per-file access permissions>" \
|
||||
--permissions-client-cred "<Microsoft Graph API application credentials, to process per-file access permissions>" \
|
||||
--permissions-tenant "<e.g https://contoso.onmicrosoft.com (tenant URL) to process per-file access permissions>" \
|
||||
--files-only "Flag to process only files within the site(s)" \
|
||||
--output-dir sharepoint-ingest-output \
|
||||
--num-processes 2 \
|
||||
@ -98,6 +108,10 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
||||
client_id="<Microsoft Sharepoint app client-id>",
|
||||
client_cred="<Microsoft Sharepoint app client-secret>",
|
||||
site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
|
||||
# Credentials to process data about permissions (rbac) within the tenant
|
||||
permissions_application_id="<Microsoft Graph API application id>",
|
||||
permissions_client_cred="<Microsoft Graph API application credentials>",
|
||||
permissions_tenant="<e.g https://contoso.onmicrosoft.com to process permission info within tenant>",
|
||||
# Flag to process only files within the site(s)
|
||||
files_only=True,
|
||||
path="Shared Documents",
|
||||
|
5
examples/ingest/sharepoint/ingest.sh
Normal file → Executable file
5
examples/ingest/sharepoint/ingest.sh
Normal file → Executable file
@ -12,6 +12,8 @@
|
||||
# To get the credentials for your Sharepoint app, follow these steps:
|
||||
# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal
|
||||
|
||||
# To optionally set up your application and obtain permissions related variables (--permissions-application-id, --permissions-client-cred, --permissions-tenant), follow these steps:
|
||||
# https://tsmatz.wordpress.com/2016/10/07/application-permission-with-v2-endpoint-and-microsoft-graph
|
||||
|
||||
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
@ -22,6 +24,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--client-id "<Microsoft Sharepoint app client-id>" \
|
||||
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
||||
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
||||
--permissions-application-id "<Microsoft Graph API application id to process per-file access permissions>" \
|
||||
--permissions-client-cred "<Microsoft Graph API application credentials to process per-file access permissions>" \
|
||||
--permissions-tenant "<e.g https://contoso.onmicrosoft.com to process per-file access permissions>" \
|
||||
--files-only "Flag to process only files within the site(s)" \
|
||||
--output-dir sharepoint-ingest-output \
|
||||
--num-processes 2 \
|
||||
|
@ -46,6 +46,7 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
|
||||
elif ! diff -ru "$EXPECTED_OUTPUT_DIR" "$OUTPUT_DIR" ; then
|
||||
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$EXPECTED_OUTPUT_DIR" "$EXPECTED_OUTPUT_DIR_TEXT"
|
||||
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$OUTPUT_DIR" "$OUTPUT_DIR_TEXT"
|
||||
"$SCRIPT_DIR"/clean-permissions-files.sh "$OUTPUT_DIR_TEXT"
|
||||
diff -ru "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"> outputdiff.txt
|
||||
cat outputdiff.txt
|
||||
diffstat -c outputdiff.txt
|
||||
|
27
test_unstructured_ingest/clean-permissions-files.sh
Executable file
27
test_unstructured_ingest/clean-permissions-files.sh
Executable file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Description: Delete (cleanup) permissions files in a folder, so that they are not included in
|
||||
# text diff tests.
|
||||
#
|
||||
# Arguments:
|
||||
# - $1: Name of the folder to do the cleanup operation in.
|
||||
|
||||
set +e
|
||||
if [ "$#" -ne 1 ]; then
|
||||
echo "Please provide a folder to clean the files in: $0 <folder_path>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
folder_path="$1"
|
||||
if [ ! -d "$folder_path" ]; then
|
||||
echo "'$folder_path' is not a directory. Please provide a folder / directory."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for file in "$folder_path"/*_SEP_*; do
|
||||
if [ -e "$file" ]; then
|
||||
rm "$file"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "Completed cleanup for permissions files"
|
@ -11,7 +11,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:04:55",
|
||||
"date_modified": "2023-06-16T05:04:55"
|
||||
"date_modified": "2023-06-16T05:04:55",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain",
|
||||
@ -33,7 +121,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:04:55",
|
||||
"date_modified": "2023-06-16T05:04:55"
|
||||
"date_modified": "2023-06-16T05:04:55",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain",
|
||||
@ -55,7 +231,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:04:55",
|
||||
"date_modified": "2023-06-16T05:04:55"
|
||||
"date_modified": "2023-06-16T05:04:55",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain",
|
||||
@ -77,7 +341,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:04:55",
|
||||
"date_modified": "2023-06-16T05:04:55"
|
||||
"date_modified": "2023-06-16T05:04:55",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain",
|
||||
@ -99,7 +451,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:04:55",
|
||||
"date_modified": "2023-06-16T05:04:55"
|
||||
"date_modified": "2023-06-16T05:04:55",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain",
|
||||
@ -121,7 +561,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:04:55",
|
||||
"date_modified": "2023-06-16T05:04:55"
|
||||
"date_modified": "2023-06-16T05:04:55",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain",
|
||||
|
@ -11,7 +11,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:04:47",
|
||||
"date_modified": "2023-06-16T05:04:47"
|
||||
"date_modified": "2023-06-16T05:04:47",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
|
@ -11,7 +11,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:05:05",
|
||||
"date_modified": "2023-06-16T05:05:05"
|
||||
"date_modified": "2023-06-16T05:05:05",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "stanley-cups.xlsx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
@ -36,7 +124,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:05:05",
|
||||
"date_modified": "2023-06-16T05:05:05"
|
||||
"date_modified": "2023-06-16T05:05:05",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "stanley-cups.xlsx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
@ -61,7 +237,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:05:05",
|
||||
"date_modified": "2023-06-16T05:05:05"
|
||||
"date_modified": "2023-06-16T05:05:05",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "stanley-cups.xlsx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
@ -86,7 +350,95 @@
|
||||
"site_url": "https://unstructuredio.sharepoint.com"
|
||||
},
|
||||
"date_created": "2023-06-16T05:05:05",
|
||||
"date_modified": "2023-06-16T05:05:05"
|
||||
"date_modified": "2023-06-16T05:05:05",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Owners",
|
||||
"id": "3",
|
||||
"loginName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Owners"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"roles": [
|
||||
"read"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Visitors",
|
||||
"id": "4",
|
||||
"loginName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Visitors"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"roles": [
|
||||
"write"
|
||||
],
|
||||
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||
"grantedToV2": {
|
||||
"siteGroup": {
|
||||
"displayName": "Communication site Members",
|
||||
"id": "5",
|
||||
"loginName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Communication site Members"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
},
|
||||
{
|
||||
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"roles": [
|
||||
"owner"
|
||||
],
|
||||
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||
"grantedToV2": {
|
||||
"group": {
|
||||
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
},
|
||||
"siteUser": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "7",
|
||||
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"grantedTo": {
|
||||
"user": {
|
||||
"displayName": "Global Administrator",
|
||||
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||
}
|
||||
},
|
||||
"inheritedFrom": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"filename": "stanley-cups.xlsx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
|
@ -19,6 +19,11 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
|
||||
echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "$OPENAI_API_KEY" ]; then
|
||||
echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set."
|
||||
exit 0
|
||||
@ -85,6 +90,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--client-cred "$SHAREPOINT_CRED" \
|
||||
--client-id "$SHAREPOINT_CLIENT_ID" \
|
||||
--site "$SHAREPOINT_SITE" \
|
||||
--permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \
|
||||
--permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \
|
||||
--permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \
|
||||
--path "Shared Documents" \
|
||||
--recursive \
|
||||
--embedding-api-key "$OPENAI_API_KEY" \
|
||||
|
@ -26,6 +26,12 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
|
||||
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
|
||||
echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
sharepoint \
|
||||
@ -40,6 +46,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--client-cred "$SHAREPOINT_CRED" \
|
||||
--client-id "$SHAREPOINT_CLIENT_ID" \
|
||||
--site "$SHAREPOINT_SITE" \
|
||||
--permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \
|
||||
--permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \
|
||||
--permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \
|
||||
--path "Shared Documents" \
|
||||
--recursive \
|
||||
--work-dir "$WORK_DIR"
|
||||
|
@ -45,6 +45,7 @@ class DataSourceMetadata:
|
||||
date_created: Optional[str] = None
|
||||
date_modified: Optional[str] = None
|
||||
date_processed: Optional[str] = None
|
||||
permissions_data: Optional[List[Dict[str, Any]]] = None
|
||||
|
||||
def to_dict(self):
|
||||
return {key: value for key, value in self.__dict__.items() if value is not None}
|
||||
|
@ -11,6 +11,7 @@ from unstructured.ingest.interfaces import (
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
PartitionConfig,
|
||||
PermissionsConfig,
|
||||
ProcessorConfig,
|
||||
ReadConfig,
|
||||
)
|
||||
@ -287,12 +288,12 @@ class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
|
||||
):
|
||||
"""
|
||||
Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
|
||||
This allows CLI arguments to be prepended with chunk_ during CLI invocation but
|
||||
This allows CLI arguments to be prepended with embedding_ during CLI invocation but
|
||||
doesn't require that as part of the field names in this class
|
||||
"""
|
||||
if isinstance(kvs, dict):
|
||||
new_kvs = {
|
||||
k[len("embedding-") :]: v # noqa: E203
|
||||
k[len("embedding_") :]: v # noqa: E203
|
||||
for k, v in kvs.items()
|
||||
if k.startswith("embedding_")
|
||||
}
|
||||
@ -363,3 +364,75 @@ class CliChunkingConfig(ChunkingConfig, CliMixin):
|
||||
return None
|
||||
return _decode_dataclass(cls, new_kvs, infer_missing)
|
||||
return _decode_dataclass(cls, kvs, infer_missing)
|
||||
|
||||
|
||||
class CliPermissionsConfig(PermissionsConfig, CliMixin):
    """CLI variant of PermissionsConfig.

    Registers the ``--permissions-*`` click options and decodes the parsed CLI
    values (keyed with a ``permissions_`` prefix) into the dataclass fields.
    """

    @staticmethod
    def add_cli_options(cmd: click.Command) -> None:
        """Append the three ``--permissions-*`` options to ``cmd.params``."""
        options = [
            click.Option(
                ["--permissions-application-id"],
                type=str,
                help="Microsoft Graph API application id",
            ),
            click.Option(
                ["--permissions-client-cred"],
                type=str,
                help="Microsoft Graph API application credentials",
            ),
            click.Option(
                ["--permissions-tenant"],
                type=str,
                help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.",
            ),
        ]
        cmd.params.extend(options)

    @classmethod
    def from_dict(
        cls,
        kvs: Json,
        *,
        infer_missing=False,
    ):
        """
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
        This allows CLI arguments to be prepended with permissions_ during CLI invocation but
        doesn't require that as part of the field names in this class. It also checks if the
        CLI params are provided as intended.

        Returns None when ``kvs`` is a dict without any ``permissions_``-prefixed key.

        Raises:
            ValueError: if some, but not all, of the three permissions values are set.
        """
        if not isinstance(kvs, dict):
            return _decode_dataclass(cls, kvs, infer_missing)

        # .get() rather than indexing: a dict that lacks these keys means
        # "nothing provided", not a KeyError.
        provided = [
            kvs.get("permissions_application_id"),
            kvs.get("permissions_client_cred"),
            kvs.get("permissions_tenant"),
        ]
        if any(provided) and not all(provided):
            raise ValueError(
                "Please provide either none or all of the following optional values:\n"
                "--permissions-application-id\n"
                "--permissions-client-cred\n"
                "--permissions-tenant",
            )

        new_kvs = {
            k[len("permissions_") :]: v  # noqa: E203
            for k, v in kvs.items()
            if k.startswith("permissions_")
        }
        if not new_kvs:
            return None
        return _decode_dataclass(cls, new_kvs, infer_missing)
|
||||
|
@ -8,6 +8,7 @@ from unstructured.ingest.cli.interfaces import (
|
||||
CliEmbeddingConfig,
|
||||
CliMixin,
|
||||
CliPartitionConfig,
|
||||
CliPermissionsConfig,
|
||||
CliProcessorConfig,
|
||||
CliReadConfig,
|
||||
)
|
||||
@ -39,6 +40,7 @@ def extract_configs(
|
||||
"embedding_config": CliEmbeddingConfig.from_dict(data),
|
||||
"chunking_config": CliChunkingConfig.from_dict(data),
|
||||
"processor_config": CliProcessorConfig.from_dict(data),
|
||||
"permissions_config": CliPermissionsConfig.from_dict(data),
|
||||
}
|
||||
for v in validate:
|
||||
v.from_dict(data)
|
||||
@ -52,6 +54,7 @@ def add_options(cmd: click.Command, extras=t.List[t.Type[CliMixin]]) -> click.Co
|
||||
CliEmbeddingConfig,
|
||||
CliChunkingConfig,
|
||||
CliProcessorConfig,
|
||||
CliPermissionsConfig,
|
||||
]
|
||||
configs.extend(extras)
|
||||
for config in configs:
|
||||
|
@ -1,3 +1,5 @@
|
||||
import json
|
||||
import os
|
||||
import typing as t
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
@ -15,6 +17,7 @@ from unstructured.ingest.interfaces import (
|
||||
SourceConnectorCleanupMixin,
|
||||
SourceMetadata,
|
||||
)
|
||||
from unstructured.ingest.interfaces import PermissionsConfig as SharepointPermissionsConfig
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
@ -35,6 +38,7 @@ class SimpleSharepointConfig(BaseConnectorConfig):
|
||||
path: str
|
||||
process_pages: bool = False
|
||||
recursive: bool = False
|
||||
permissions_config: t.Optional[SharepointPermissionsConfig] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if not (self.client_id and self.client_credential and self.site_url):
|
||||
@ -57,6 +61,14 @@ class SimpleSharepointConfig(BaseConnectorConfig):
|
||||
raise
|
||||
return site_client
|
||||
|
||||
def get_permissions_client(self):
    """Build a SharepointPermissionsConnector from ``self.permissions_config``.

    Returns:
        The connector when it holds a non-empty access token, otherwise None
        (the failure is logged instead of raised).
    """
    try:
        permissions_connector = SharepointPermissionsConnector(self.permissions_config)
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not permissions_connector.access_token:
            raise ValueError("empty access token")
    except Exception as e:
        # The exception must go through a %s placeholder; passing it as a bare
        # extra argument makes the logging module fail to format the record.
        logger.error("Couldn't obtain Sharepoint permissions ingestion access token: %s", e)
        return None
    return permissions_connector
|
||||
|
||||
|
||||
@dataclass
|
||||
class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
@ -122,7 +134,6 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
file = site_client.web.get_file_by_server_relative_url(self.server_path)
|
||||
if properties_only:
|
||||
file = file.get().execute_query()
|
||||
|
||||
except ClientRequestException as e:
|
||||
if e.response.status_code == 404:
|
||||
return None
|
||||
@ -144,6 +155,44 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
return None
|
||||
return page
|
||||
|
||||
def update_permissions_data(self):
    """Return the permissions record stored on disk for this document, if any.

    Permissions files live under ``output_dir/permissions_data`` in a ``sites``
    or ``other`` subdirectory and are named ``PARENT_SEP_DOCNAME.json``. A file
    matches when DOCNAME equals the last segment of ``self.file_path`` and:

    * for paths starting with ``sites``: PARENT equals the second path segment;
    * for paths starting with ``SitePages`` or ``Shared Documents``: always.

    Returns:
        The parsed JSON content of the matching file, or None when the
        directory is missing or no file matches.
    """
    path_parts = self.file_path.split("/")
    parent_type = path_parts[0]
    ingestdoc_docname = path_parts[-1]

    permissions_dir = Path(self.processor_config.output_dir) / "permissions_data"
    read_dir = permissions_dir / ("sites" if parent_type == "sites" else "other")
    # Also covers a missing permissions_data directory.
    if not read_dir.is_dir():
        return None

    permissions_data = None
    for filename in os.listdir(read_dir):
        stem = os.path.splitext(filename)[0]
        parent_name, separator, permissions_docname = stem.partition("_SEP_")
        # Skip stray files that do not follow the PARENT_SEP_DOCNAME scheme.
        if not separator or permissions_docname != ingestdoc_docname:
            continue

        if parent_type == "sites":
            parent_matches = len(path_parts) > 1 and parent_name == path_parts[1]
        else:
            parent_matches = parent_type in ("SitePages", "Shared Documents")

        if parent_matches:
            with open(read_dir / filename) as f:
                permissions_data = json.load(f)

    return permissions_data
|
||||
|
||||
def update_source_metadata(self, **kwargs):
|
||||
if self.is_page:
|
||||
page = self._fetch_page()
|
||||
@ -158,6 +207,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
version=page.get_property("Version", ""),
|
||||
source_url=page.absolute_url,
|
||||
exists=True,
|
||||
permissions_data=self.update_permissions_data()
|
||||
if self.connector_config.permissions_config
|
||||
else None,
|
||||
)
|
||||
return
|
||||
|
||||
@ -176,6 +228,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
version=file.major_version,
|
||||
source_url=file.properties.get("LinkingUrl", None),
|
||||
exists=True,
|
||||
permissions_data=self.update_permissions_data()
|
||||
if self.connector_config.permissions_config
|
||||
else None,
|
||||
)
|
||||
|
||||
def _download_page(self):
|
||||
@ -317,6 +372,12 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
|
||||
|
||||
def get_ingest_docs(self):
|
||||
base_site_client = self.connector_config.get_site_client()
|
||||
|
||||
if self.connector_config.permissions_config:
|
||||
permissions_client = self.connector_config.get_permissions_client()
|
||||
if permissions_client:
|
||||
permissions_client.write_all_permissions(self.processor_config.output_dir)
|
||||
|
||||
if not base_site_client.is_tenant:
|
||||
return self._ingest_site_docs(base_site_client)
|
||||
tenant = base_site_client.tenant
|
||||
@ -328,3 +389,166 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
|
||||
site_client = self.connector_config.get_site_client(site_url)
|
||||
ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
|
||||
return ingest_docs
|
||||
|
||||
|
||||
@dataclass
class SharepointPermissionsConnector:
    """Collects item-level permissions through the Microsoft Graph API.

    An access token is requested on construction using the tenant, application
    id and client credential of the given permissions config. The token is then
    sent as a Bearer header on every Graph request.
    """

    def __init__(self, permissions_config):
        self.permissions_config: SharepointPermissionsConfig = permissions_config
        self.initialize()

    def initialize(self):
        """Request the access token and store it on the instance."""
        self.access_token: str = self.get_access_token()

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_access_token(self) -> str:
        """Request a client-credentials token from login.microsoftonline.com.

        Raises:
            KeyError: if the token endpoint's JSON reply has no ``access_token``.
        """
        import requests

        url = (
            f"https://login.microsoftonline.com/{self.permissions_config.tenant}/oauth2/v2.0/token"
        )
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        data = {
            "client_id": self.permissions_config.application_id,
            "scope": "https://graph.microsoft.com/.default",
            "client_secret": self.permissions_config.client_cred,
            "grant_type": "client_credentials",
        }
        response = requests.post(url, headers=headers, data=data)
        return response.json()["access_token"]

    def validated_response(self, response):
        """Return the decoded JSON body of a 200 response; log and return None otherwise."""
        if response.status_code == 200:
            return response.json()
        logger.error(
            "Request failed with status code %s: %s",
            response.status_code,
            response.text,
        )
        return None

    @requires_dependencies(["requests"], extras="sharepoint")
    def _graph_get(self, url, params=None):
        """Send an authenticated GET to ``url`` and return the validated JSON (or None)."""
        import requests

        headers = {
            "Authorization": f"Bearer {self.access_token}",
        }
        response = requests.get(url, params=params, headers=headers)
        return self.validated_response(response)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_sites(self):
        """List sites, selecting only their ``webUrl`` and ``id``."""
        params = {
            "$select": "webUrl, id",
        }
        return self._graph_get("https://graph.microsoft.com/v1.0/sites", params=params)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drives(self, site):
        """List the drives of the site with id ``site``."""
        return self._graph_get(f"https://graph.microsoft.com/v1.0/sites/{site}/drives")

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drive_items(self, site, drive_id):
        """List the children of the root folder of drive ``drive_id``."""
        return self._graph_get(
            f"https://graph.microsoft.com/v1.0/sites/{site}/drives/{drive_id}/root/children",
        )

    def extract_site_name_from_weburl(self, weburl):
        """Classify an item by the first segment of its web URL path.

        Returns:
            A ``(parent_type, parent_name)`` tuple, or ``(None, None)`` (after
            logging a warning) when the path layout is not recognised.
        """
        split_path = urlparse(weburl).path.lstrip("/").split("/")
        parent = split_path[0]

        # Guard the index: a bare "/sites" path has no site name segment.
        if parent == "sites" and len(split_path) > 1:
            return "sites", split_path[1]

        # urlparse does not decode the path, hence the percent-encoded literal.
        if parent == "Shared%20Documents":
            return "Shared Documents", "Shared Documents"

        if parent == "personal":
            return "Personal", "Personal"

        if parent == "_layouts":
            return "layouts", "layouts"

        # if other weburl structures are found, additional logic might need to be implemented
        logger.warning(
            "Couldn't extract sitename, unknown site or parent type. Skipping permissions "
            "ingestion for the document with the URL: %s",
            weburl,
        )
        return None, None

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_permissions_for_drive_item(self, site, drive_id, item_id):
        """Fetch the permissions of one drive item."""
        # Adjacent literals, not a backslash continuation inside the string:
        # the latter embeds a space and the next line's indentation in the URL.
        url = (
            f"https://graph.microsoft.com/v1.0/sites/{site}"
            f"/drives/{drive_id}/items/{item_id}/permissions"
        )
        return self._graph_get(url)

    def write_all_permissions(self, output_dir):
        """Write one JSON permissions file per root-level drive item.

        Files go to ``output_dir/permissions_data/sites`` for items under a
        ``sites`` URL and to ``output_dir/permissions_data/other`` otherwise,
        named ``PARENT_SEP_ITEMNAME.json``. Requests that fail are skipped.
        """
        sites_response = self.get_sites()
        if not sites_response:
            # Best effort, like every later step: without a site list there is
            # nothing to collect.
            logger.error("Couldn't list Sharepoint sites, skipping permissions ingestion")
            return

        logger.info("Obtaining drive data for sites for permissions (rbac)")
        drive_ids = []
        for site_entry in sites_response["value"]:
            site_id = site_entry["id"]
            drives = self.get_drives(site_id)
            if drives:
                drive_ids.extend((site_id, drive["id"]) for drive in drives["value"])

        logger.info("Obtaining item data from drives for permissions (rbac)")
        item_ids = []
        for site, drive_id in drive_ids:
            drive_items = self.get_drive_items(site, drive_id)
            if drive_items:
                item_ids.extend(
                    (site, drive_id, item["id"], item["name"], item["webUrl"])
                    for item in drive_items["value"]
                )

        permissions_dir = Path(output_dir) / "permissions_data"

        logger.info("Writing permissions data to disk")
        for site, drive_id, item_id, item_name, item_web_url in item_ids:
            res = self.get_permissions_for_drive_item(site, drive_id, item_id)
            if not res:
                continue

            parent_type, parent_name = self.extract_site_name_from_weburl(item_web_url)
            subdir = "sites" if parent_type == "sites" else "other"
            write_path = permissions_dir / subdir / f"{parent_name}_SEP_{item_name}.json"
            write_path.parent.mkdir(parents=True, exist_ok=True)

            with open(write_path, "w") as f:
                json.dump(res["value"], f)
|
||||
|
@ -107,6 +107,14 @@ class ChunkingConfig(BaseConfig):
|
||||
return elements
|
||||
|
||||
|
||||
@dataclass
class PermissionsConfig(BaseConfig):
    """Settings for permissions ingestion: application id, client credential and tenant."""

    application_id: t.Optional[str]
    client_cred: t.Optional[str]
    tenant: t.Optional[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class WriteConfig(BaseConfig):
|
||||
pass
|
||||
@ -123,6 +131,7 @@ class SourceMetadata(DataClassJsonMixin, ABC):
|
||||
version: t.Optional[str] = None
|
||||
source_url: t.Optional[str] = None
|
||||
exists: t.Optional[bool] = None
|
||||
permissions_data: t.Optional[t.List[t.Dict[str, t.Any]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -212,6 +221,13 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
|
||||
the version of the document."""
|
||||
return self.source_metadata.version # type: ignore
|
||||
|
||||
@property
def permissions_data(self) -> t.Optional[t.List[t.Dict[str, t.Any]]]:
    """Permissions (access control / sharing) records from the source system.

    Source metadata is populated first when it has not been set yet.
    """
    metadata_missing = self.source_metadata is None
    if metadata_missing:
        self.update_source_metadata()
    return self.source_metadata.permissions_data  # type: ignore
|
||||
|
||||
@abstractmethod
|
||||
def cleanup_file(self):
|
||||
"""Removes the local copy the file (or anything else) after successful processing."""
|
||||
@ -240,6 +256,12 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
|
||||
"""Sets the SourceMetadata and the properties for the doc"""
|
||||
self._source_metadata = SourceMetadata()
|
||||
|
||||
def update_permissions_data(self):
    """Reset the doc's ``_permissions_data`` attribute to None.

    Default implementation: it stores no permissions. The stored value is meant
    to feed SourceMetadata.permissions_data and, through it, the
    permissions_data property.
    """
    no_permissions = None
    self._permissions_data: t.Optional[t.List[t.Dict]] = no_permissions
|
||||
|
||||
# NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods
|
||||
# in addition to or instead of get_file()
|
||||
@abstractmethod
|
||||
@ -269,6 +291,7 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
|
||||
date_created=self.date_created,
|
||||
date_modified=self.date_modified,
|
||||
date_processed=self.date_processed,
|
||||
permissions_data=self.permissions_data,
|
||||
),
|
||||
**partition_kwargs,
|
||||
)
|
||||
@ -420,6 +443,38 @@ class SourceConnectorCleanupMixin:
|
||||
os.rmdir(cur_dir)
|
||||
|
||||
|
||||
class PermissionsCleanupMixin:
    """Mixin that removes the permissions data written to disk during ingest."""

    processor_config: ProcessorConfig

    def cleanup_permissions(self, cur_dir=None):
        """Recursively clean up the permissions data files and directories.

        Args:
            cur_dir: file or directory to remove. Defaults to the
                ``permissions_data`` directory under ``processor_config.output_dir``.

        A target that does not exist is ignored. The process working directory
        is left untouched.
        """
        # Local import: only the visible part of this module is known, so the
        # new dependency is kept inside the method.
        import shutil

        if cur_dir is None:
            target = Path(self.processor_config.output_dir, "permissions_data")
        else:
            target = Path(cur_dir)

        # Symlinks are unlinked, never traversed, not that there ever should be any.
        if target.is_symlink() or target.is_file():
            target.unlink()
        elif target.is_dir():
            # rmtree removes symlinks found inside the tree without following them.
            shutil.rmtree(target)
|
||||
|
||||
|
||||
class IngestDocCleanupMixin:
|
||||
read_config: ReadConfig
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
from .doc_factory import DocFactory
|
||||
from .interfaces import PipelineContext, ReformatNode
|
||||
from .partition import Partitioner
|
||||
from .permissions import PermissionsDataCleaner
|
||||
from .pipeline import Pipeline
|
||||
from .reformat.chunking import Chunker
|
||||
from .reformat.embedding import Embedder
|
||||
@ -17,4 +18,5 @@ __all__ = [
|
||||
"Writer",
|
||||
"Chunker",
|
||||
"ReformatNode",
|
||||
"PermissionsDataCleaner",
|
||||
]
|
||||
|
@ -212,3 +212,18 @@ class CopyNode(PipelineNode):
|
||||
@abstractmethod
|
||||
def run(self, json_path: str):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
class PermissionsNode(PipelineNode):
    """
    Pipeline node for operations on permissions related data.
    """

    def initialize(self):
        """Log that the permissions node is running, then run the base initialization."""
        logger.info("Running permissions node to cleanup the permissions folder")
        super().initialize()

    @abstractmethod
    def run(self):
        """Perform the node's work; concrete subclasses must implement this."""
|
||||
|
12
unstructured/ingest/pipeline/permissions.py
Normal file
12
unstructured/ingest/pipeline/permissions.py
Normal file
@ -0,0 +1,12 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
from unstructured.ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig
|
||||
from unstructured.ingest.pipeline.interfaces import PermissionsNode
|
||||
|
||||
|
||||
@dataclass
class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin):
    """Permissions node whose run step deletes the stored permissions data."""

    # Read by PermissionsCleanupMixin.cleanup_permissions to locate output_dir.
    processor_config: ProcessorConfig

    def run(self):
        """Delegate to the mixin's cleanup_permissions with its default target."""
        self.cleanup_permissions()
|
@ -15,6 +15,7 @@ from unstructured.ingest.pipeline.interfaces import (
|
||||
SourceNode,
|
||||
WriteNode,
|
||||
)
|
||||
from unstructured.ingest.pipeline.permissions import PermissionsDataCleaner
|
||||
from unstructured.ingest.pipeline.utils import get_ingest_doc_hash
|
||||
|
||||
|
||||
@ -26,6 +27,7 @@ class Pipeline(DataClassJsonMixin):
|
||||
partition_node: PartitionNode
|
||||
write_node: t.Optional[WriteNode] = None
|
||||
reformat_nodes: t.List[ReformatNode] = field(default_factory=list)
|
||||
permissions_node: t.Optional[PermissionsDataCleaner] = None
|
||||
|
||||
def initialize(self):
|
||||
ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
|
||||
@ -79,3 +81,6 @@ class Pipeline(DataClassJsonMixin):
|
||||
|
||||
if self.write_node:
|
||||
self.write_node(iterable=partitioned_jsons)
|
||||
|
||||
if self.permissions_node:
|
||||
self.permissions_node.cleanup_permissions()
|
||||
|
@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import (
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
PartitionConfig,
|
||||
PermissionsConfig,
|
||||
ProcessorConfig,
|
||||
)
|
||||
from unstructured.ingest.pipeline import (
|
||||
@ -15,6 +16,7 @@ from unstructured.ingest.pipeline import (
|
||||
DocFactory,
|
||||
Embedder,
|
||||
Partitioner,
|
||||
PermissionsDataCleaner,
|
||||
Pipeline,
|
||||
PipelineContext,
|
||||
Reader,
|
||||
@ -33,6 +35,7 @@ def process_documents(
|
||||
dest_doc_connector: t.Optional[BaseDestinationConnector] = None,
|
||||
chunking_config: t.Optional[ChunkingConfig] = None,
|
||||
embedder_config: t.Optional[EmbeddingConfig] = None,
|
||||
permissions_config: t.Optional[PermissionsConfig] = None,
|
||||
) -> None:
|
||||
pipeline_config = PipelineContext.from_dict(processor_config.to_dict())
|
||||
doc_factory = DocFactory(
|
||||
@ -64,6 +67,11 @@ def process_documents(
|
||||
if dest_doc_connector
|
||||
else None
|
||||
)
|
||||
permissions_data_cleaner = (
|
||||
PermissionsDataCleaner(pipeline_context=pipeline_config, processor_config=processor_config)
|
||||
if permissions_config
|
||||
else None
|
||||
)
|
||||
pipeline = Pipeline(
|
||||
pipeline_context=pipeline_config,
|
||||
doc_factory_node=doc_factory,
|
||||
@ -71,5 +79,6 @@ def process_documents(
|
||||
partition_node=partitioner,
|
||||
reformat_nodes=reformat_nodes,
|
||||
write_node=writer,
|
||||
permissions_node=permissions_data_cleaner,
|
||||
)
|
||||
pipeline.run()
|
||||
|
@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import (
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
PartitionConfig,
|
||||
PermissionsConfig,
|
||||
ProcessorConfig,
|
||||
ReadConfig,
|
||||
)
|
||||
@ -24,6 +25,7 @@ class Runner(ABC):
|
||||
writer_kwargs: t.Optional[dict] = None
|
||||
embedding_config: t.Optional[EmbeddingConfig] = None
|
||||
chunking_config: t.Optional[ChunkingConfig] = None
|
||||
permissions_config: t.Optional[PermissionsConfig] = None
|
||||
|
||||
@abstractmethod
|
||||
def run(self, *args, **kwargs):
|
||||
@ -36,6 +38,18 @@ class Runner(ABC):
|
||||
return writer(**writer_kwargs)
|
||||
return None
|
||||
|
||||
def get_permissions_config(self) -> t.Optional[PermissionsConfig]:
    """Return the permissions config only when it is completely filled in.

    Returns:
        ``self.permissions_config`` when it is set and its ``application_id``,
        ``client_cred`` and ``tenant`` are all truthy; ``None`` otherwise.
    """
    config = self.permissions_config
    if config is None:
        return None

    # A partially filled config is treated exactly like a missing one: all
    # three values must be present for the config to be handed on.
    if not (config.application_id and config.client_cred and config.tenant):
        return None
    return config
|
||||
|
||||
def process_documents(self, source_doc_connector: BaseSourceConnector):
|
||||
process_documents(
|
||||
processor_config=self.processor_config,
|
||||
@ -44,4 +58,5 @@ class Runner(ABC):
|
||||
dest_doc_connector=self.get_dest_doc_connector(),
|
||||
embedder_config=self.embedding_config,
|
||||
chunking_config=self.chunking_config,
|
||||
permissions_config=self.get_permissions_config(),
|
||||
)
|
||||
|
@ -1,5 +1,6 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
from unstructured.ingest.logger import ingest_log_streaming_init, logger
|
||||
from unstructured.ingest.runner.base_runner import Runner
|
||||
@ -12,6 +13,9 @@ class SharePointRunner(Runner):
|
||||
site: str,
|
||||
client_id: str,
|
||||
client_cred: str,
|
||||
permissions_application_id: t.Optional[str],
|
||||
permissions_client_cred: t.Optional[str],
|
||||
permissions_tenant: t.Optional[str],
|
||||
path: str,
|
||||
files_only: bool = False,
|
||||
recursive: bool = False,
|
||||
@ -31,10 +35,17 @@ class SharePointRunner(Runner):
|
||||
)
|
||||
|
||||
from unstructured.ingest.connector.sharepoint import (
|
||||
SharepointPermissionsConfig,
|
||||
SharepointSourceConnector,
|
||||
SimpleSharepointConfig,
|
||||
)
|
||||
|
||||
permissions_config = SharepointPermissionsConfig(
|
||||
application_id=permissions_application_id,
|
||||
client_cred=permissions_client_cred,
|
||||
tenant=permissions_tenant,
|
||||
)
|
||||
|
||||
source_doc_connector = SharepointSourceConnector( # type: ignore
|
||||
processor_config=self.processor_config,
|
||||
connector_config=SimpleSharepointConfig(
|
||||
@ -44,6 +55,7 @@ class SharePointRunner(Runner):
|
||||
path=path,
|
||||
process_pages=(not files_only),
|
||||
recursive=recursive,
|
||||
permissions_config=permissions_config,
|
||||
),
|
||||
read_config=self.read_config,
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user