mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-22 05:25:29 +00:00
feat: add file-based access permissions for SharePoint ingest (#1628)
This PR: - defines rbac_data as a SourceMetadata field, - manages connections to an external api for obtaining rbac data with ConnectorRBAC class, - serializes rbac data and saves it to the disk, - matches the rbac_data in the disk to each IngestDoc, using a common field, - forwards rbac data to Elements, via the partition() function To test the changes, run `examples/ingest/sharepoint/ingest.sh` with the relevant rbac & connector credentials --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com>
This commit is contained in:
parent
3ec3673d34
commit
94836cfad4
3
.github/workflows/ci.yml
vendored
3
.github/workflows/ci.yml
vendored
@ -288,6 +288,9 @@ jobs:
|
|||||||
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
|
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
|
||||||
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
|
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
|
||||||
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
|
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
|
||||||
|
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
|
||||||
|
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
|
||||||
|
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
|
||||||
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
|
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
|
||||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||||
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
|
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
|
||||||
|
@ -85,6 +85,9 @@ jobs:
|
|||||||
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
|
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
|
||||||
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
|
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
|
||||||
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
|
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
|
||||||
|
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
|
||||||
|
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
|
||||||
|
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
|
||||||
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
|
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
|
||||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||||
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
|
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
|
||||||
|
@ -8,9 +8,11 @@
|
|||||||
### Features
|
### Features
|
||||||
* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.
|
* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.
|
||||||
|
|
||||||
|
* **Adds permissions (RBAC) data ingestion functionality for the Sharepoint connector.** Problem: Role-based access control is an important component in many data storage systems. Users may need to pass permissions (RBAC) data to downstream systems when ingesting data. Feature: Added permissions data ingestion functionality to the Sharepoint connector.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list items
|
* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list item
|
||||||
* **Fixes duplicated elements** Fixes issue where elements are duplicated when embeddings are generated. This will allow users to generate embeddings for their list of Elements without duplicating/breaking the original content.
|
* **Fixes duplicated elements** Fixes issue where elements are duplicated when embeddings are generated. This will allow users to generate embeddings for their list of Elements without duplicating/breaking the original content.
|
||||||
* **Fixes failure when flagging for embeddings through unstructured-ingest** Currently adding the embedding parameter to any connector results in a failure on the copy stage. This resolves the issue by adding the IngestDoc to the context map in the embedding node's `run` method. This allows users to specify that connectors fetch embeddings without failure.
|
* **Fixes failure when flagging for embeddings through unstructured-ingest** Currently adding the embedding parameter to any connector results in a failure on the copy stage. This resolves the issue by adding the IngestDoc to the context map in the embedding node's `run` method. This allows users to specify that connectors fetch embeddings without failure.
|
||||||
* **Fix ingest pipeline reformat nodes not discoverable** Fixes issue where reformat nodes raise ModuleNotFoundError on import. This happened because the directory was missing the `__init__.py` file needed to make it discoverable.
|
* **Fix ingest pipeline reformat nodes not discoverable** Fixes issue where reformat nodes raise ModuleNotFoundError on import. This happened because the directory was missing the `__init__.py` file needed to make it discoverable.
|
||||||
|
@ -22,6 +22,9 @@ Run Locally
|
|||||||
--client-id "<Microsoft Sharepoint app client-id>" \
|
--client-id "<Microsoft Sharepoint app client-id>" \
|
||||||
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
||||||
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
||||||
|
--permissions-application-id "<Microsoft Graph API application id, to process per-file access permissions>" \
|
||||||
|
--permissions-client-cred "<Microsoft Graph API application credentials, to process per-file access permissions>" \
|
||||||
|
--permissions-tenant "<e.g https://contoso.onmicrosoft.com (tenant URL) to process per-file access permissions>" \
|
||||||
--files-only "Flag to process only files within the site(s)" \
|
--files-only "Flag to process only files within the site(s)" \
|
||||||
--output-dir sharepoint-ingest-output \
|
--output-dir sharepoint-ingest-output \
|
||||||
--num-processes 2 \
|
--num-processes 2 \
|
||||||
@ -46,6 +49,10 @@ Run Locally
|
|||||||
client_id="<Microsoft Sharepoint app client-id>",
|
client_id="<Microsoft Sharepoint app client-id>",
|
||||||
client_cred="<Microsoft Sharepoint app client-secret>",
|
client_cred="<Microsoft Sharepoint app client-secret>",
|
||||||
site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
|
site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
|
||||||
|
# Credentials to process data about permissions (rbac) within the tenant
|
||||||
|
permissions_application_id="<Microsoft Graph API application id>",
|
||||||
|
permissions_client_cred="<Microsoft Graph API application credentials>",
|
||||||
|
permissions_tenant="<e.g https://contoso.onmicrosoft.com to process permission info within tenant>",
|
||||||
# Flag to process only files within the site(s)
|
# Flag to process only files within the site(s)
|
||||||
files_only=True,
|
files_only=True,
|
||||||
path="Shared Documents",
|
path="Shared Documents",
|
||||||
@ -68,6 +75,9 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
|||||||
--client-id "<Microsoft Sharepoint app client-id>" \
|
--client-id "<Microsoft Sharepoint app client-id>" \
|
||||||
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
||||||
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
||||||
|
--permissions-application-id "<Microsoft Graph API application id, to process per-file access permissions>" \
|
||||||
|
--permissions-client-cred "<Microsoft Graph API application credentials, to process per-file access permissions>" \
|
||||||
|
--permissions-tenant "<e.g https://contoso.onmicrosoft.com (tenant URL) to process per-file access permissions>" \
|
||||||
--files-only "Flag to process only files within the site(s)" \
|
--files-only "Flag to process only files within the site(s)" \
|
||||||
--output-dir sharepoint-ingest-output \
|
--output-dir sharepoint-ingest-output \
|
||||||
--num-processes 2 \
|
--num-processes 2 \
|
||||||
@ -98,6 +108,10 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
|
|||||||
client_id="<Microsoft Sharepoint app client-id>",
|
client_id="<Microsoft Sharepoint app client-id>",
|
||||||
client_cred="<Microsoft Sharepoint app client-secret>",
|
client_cred="<Microsoft Sharepoint app client-secret>",
|
||||||
site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
|
site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
|
||||||
|
# Credentials to process data about permissions (rbac) within the tenant
|
||||||
|
permissions_application_id="<Microsoft Graph API application id>",
|
||||||
|
permissions_client_cred="<Microsoft Graph API application credentials>",
|
||||||
|
permissions_tenant="<e.g https://contoso.onmicrosoft.com to process permission info within tenant>",
|
||||||
# Flag to process only files within the site(s)
|
# Flag to process only files within the site(s)
|
||||||
files_only=True,
|
files_only=True,
|
||||||
path="Shared Documents",
|
path="Shared Documents",
|
||||||
|
5
examples/ingest/sharepoint/ingest.sh
Normal file → Executable file
5
examples/ingest/sharepoint/ingest.sh
Normal file → Executable file
@ -12,6 +12,8 @@
|
|||||||
# To get the credentials for your Sharepoint app, follow these steps:
|
# To get the credentials for your Sharepoint app, follow these steps:
|
||||||
# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal
|
# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal
|
||||||
|
|
||||||
|
# To optionally set up your application and obtain permissions related variables (--permissions-application-id, --permissions-client-cred, --permissions-tenant), follow these steps:
|
||||||
|
# https://tsmatz.wordpress.com/2016/10/07/application-permission-with-v2-endpoint-and-microsoft-graph
|
||||||
|
|
||||||
|
|
||||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||||
@ -22,6 +24,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--client-id "<Microsoft Sharepoint app client-id>" \
|
--client-id "<Microsoft Sharepoint app client-id>" \
|
||||||
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
--client-cred "<Microsoft Sharepoint app client-secret>" \
|
||||||
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
|
||||||
|
--permissions-application-id "<Microsoft Graph API application id to process per-file access permissions>" \
|
||||||
|
--permissions-client-cred "<Microsoft Graph API application credentials to process per-file access permissions>" \
|
||||||
|
--permissions-tenant "<e.g https://contoso.onmicrosoft.com to process per-file access permissions>" \
|
||||||
--files-only "Flag to process only files within the site(s)" \
|
--files-only "Flag to process only files within the site(s)" \
|
||||||
--output-dir sharepoint-ingest-output \
|
--output-dir sharepoint-ingest-output \
|
||||||
--num-processes 2 \
|
--num-processes 2 \
|
||||||
|
@ -46,6 +46,7 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
|
|||||||
elif ! diff -ru "$EXPECTED_OUTPUT_DIR" "$OUTPUT_DIR" ; then
|
elif ! diff -ru "$EXPECTED_OUTPUT_DIR" "$OUTPUT_DIR" ; then
|
||||||
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$EXPECTED_OUTPUT_DIR" "$EXPECTED_OUTPUT_DIR_TEXT"
|
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$EXPECTED_OUTPUT_DIR" "$EXPECTED_OUTPUT_DIR_TEXT"
|
||||||
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$OUTPUT_DIR" "$OUTPUT_DIR_TEXT"
|
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$OUTPUT_DIR" "$OUTPUT_DIR_TEXT"
|
||||||
|
"$SCRIPT_DIR"/clean-permissions-files.sh "$OUTPUT_DIR_TEXT"
|
||||||
diff -ru "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"> outputdiff.txt
|
diff -ru "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"> outputdiff.txt
|
||||||
cat outputdiff.txt
|
cat outputdiff.txt
|
||||||
diffstat -c outputdiff.txt
|
diffstat -c outputdiff.txt
|
||||||
|
27
test_unstructured_ingest/clean-permissions-files.sh
Executable file
27
test_unstructured_ingest/clean-permissions-files.sh
Executable file
@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Description: Delete (cleanup) permissions files in a folder, so that they are not included in
|
||||||
|
# text diff tests.
|
||||||
|
#
|
||||||
|
# Arguments:
|
||||||
|
# - $1: Name of the folder to do the cleanup operation in.
|
||||||
|
|
||||||
|
set +e
|
||||||
|
if [ "$#" -ne 1 ]; then
|
||||||
|
echo "Please provide a folder to clean the files in: $0 <folder_path>"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
folder_path="$1"
|
||||||
|
if [ ! -d "$folder_path" ]; then
|
||||||
|
echo "'$folder_path' is not a directory. Please provide a folder / directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
for file in "$folder_path"/*_SEP_*; do
|
||||||
|
if [ -e "$file" ]; then
|
||||||
|
rm "$file"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Completed cleanup for permissions files"
|
@ -11,7 +11,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:04:55",
|
"date_created": "2023-06-16T05:04:55",
|
||||||
"date_modified": "2023-06-16T05:04:55"
|
"date_modified": "2023-06-16T05:04:55",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "fake-text.txt",
|
"filename": "fake-text.txt",
|
||||||
"filetype": "text/plain",
|
"filetype": "text/plain",
|
||||||
@ -33,7 +121,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:04:55",
|
"date_created": "2023-06-16T05:04:55",
|
||||||
"date_modified": "2023-06-16T05:04:55"
|
"date_modified": "2023-06-16T05:04:55",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "fake-text.txt",
|
"filename": "fake-text.txt",
|
||||||
"filetype": "text/plain",
|
"filetype": "text/plain",
|
||||||
@ -55,7 +231,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:04:55",
|
"date_created": "2023-06-16T05:04:55",
|
||||||
"date_modified": "2023-06-16T05:04:55"
|
"date_modified": "2023-06-16T05:04:55",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "fake-text.txt",
|
"filename": "fake-text.txt",
|
||||||
"filetype": "text/plain",
|
"filetype": "text/plain",
|
||||||
@ -77,7 +341,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:04:55",
|
"date_created": "2023-06-16T05:04:55",
|
||||||
"date_modified": "2023-06-16T05:04:55"
|
"date_modified": "2023-06-16T05:04:55",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "fake-text.txt",
|
"filename": "fake-text.txt",
|
||||||
"filetype": "text/plain",
|
"filetype": "text/plain",
|
||||||
@ -99,7 +451,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:04:55",
|
"date_created": "2023-06-16T05:04:55",
|
||||||
"date_modified": "2023-06-16T05:04:55"
|
"date_modified": "2023-06-16T05:04:55",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "fake-text.txt",
|
"filename": "fake-text.txt",
|
||||||
"filetype": "text/plain",
|
"filetype": "text/plain",
|
||||||
@ -121,7 +561,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:04:55",
|
"date_created": "2023-06-16T05:04:55",
|
||||||
"date_modified": "2023-06-16T05:04:55"
|
"date_modified": "2023-06-16T05:04:55",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "fake-text.txt",
|
"filename": "fake-text.txt",
|
||||||
"filetype": "text/plain",
|
"filetype": "text/plain",
|
||||||
|
@ -11,7 +11,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:04:47",
|
"date_created": "2023-06-16T05:04:47",
|
||||||
"date_modified": "2023-06-16T05:04:47"
|
"date_modified": "2023-06-16T05:04:47",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "ideas-page.html",
|
"filename": "ideas-page.html",
|
||||||
"filetype": "text/html",
|
"filetype": "text/html",
|
||||||
|
@ -11,7 +11,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:05:05",
|
"date_created": "2023-06-16T05:05:05",
|
||||||
"date_modified": "2023-06-16T05:05:05"
|
"date_modified": "2023-06-16T05:05:05",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "stanley-cups.xlsx",
|
"filename": "stanley-cups.xlsx",
|
||||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
@ -36,7 +124,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:05:05",
|
"date_created": "2023-06-16T05:05:05",
|
||||||
"date_modified": "2023-06-16T05:05:05"
|
"date_modified": "2023-06-16T05:05:05",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "stanley-cups.xlsx",
|
"filename": "stanley-cups.xlsx",
|
||||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
@ -61,7 +237,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:05:05",
|
"date_created": "2023-06-16T05:05:05",
|
||||||
"date_modified": "2023-06-16T05:05:05"
|
"date_modified": "2023-06-16T05:05:05",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "stanley-cups.xlsx",
|
"filename": "stanley-cups.xlsx",
|
||||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
@ -86,7 +350,95 @@
|
|||||||
"site_url": "https://unstructuredio.sharepoint.com"
|
"site_url": "https://unstructuredio.sharepoint.com"
|
||||||
},
|
},
|
||||||
"date_created": "2023-06-16T05:05:05",
|
"date_created": "2023-06-16T05:05:05",
|
||||||
"date_modified": "2023-06-16T05:05:05"
|
"date_modified": "2023-06-16T05:05:05",
|
||||||
|
"permissions_data": [
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Owners",
|
||||||
|
"id": "3",
|
||||||
|
"loginName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Owners"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"roles": [
|
||||||
|
"read"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Visitors",
|
||||||
|
"id": "4",
|
||||||
|
"loginName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Visitors"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"roles": [
|
||||||
|
"write"
|
||||||
|
],
|
||||||
|
"shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
|
||||||
|
"grantedToV2": {
|
||||||
|
"siteGroup": {
|
||||||
|
"displayName": "Communication site Members",
|
||||||
|
"id": "5",
|
||||||
|
"loginName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Communication site Members"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"roles": [
|
||||||
|
"owner"
|
||||||
|
],
|
||||||
|
"shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
|
||||||
|
"grantedToV2": {
|
||||||
|
"group": {
|
||||||
|
"@odata.type": "#microsoft.graph.sharePointIdentity",
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
},
|
||||||
|
"siteUser": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "7",
|
||||||
|
"loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"grantedTo": {
|
||||||
|
"user": {
|
||||||
|
"displayName": "Global Administrator",
|
||||||
|
"id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"inheritedFrom": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"filename": "stanley-cups.xlsx",
|
"filename": "stanley-cups.xlsx",
|
||||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
@ -19,6 +19,11 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
|
||||||
|
echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
if [ -z "$OPENAI_API_KEY" ]; then
|
if [ -z "$OPENAI_API_KEY" ]; then
|
||||||
echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set."
|
echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -85,6 +90,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--client-cred "$SHAREPOINT_CRED" \
|
--client-cred "$SHAREPOINT_CRED" \
|
||||||
--client-id "$SHAREPOINT_CLIENT_ID" \
|
--client-id "$SHAREPOINT_CLIENT_ID" \
|
||||||
--site "$SHAREPOINT_SITE" \
|
--site "$SHAREPOINT_SITE" \
|
||||||
|
--permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \
|
||||||
|
--permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \
|
||||||
|
--permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \
|
||||||
--path "Shared Documents" \
|
--path "Shared Documents" \
|
||||||
--recursive \
|
--recursive \
|
||||||
--embedding-api-key "$OPENAI_API_KEY" \
|
--embedding-api-key "$OPENAI_API_KEY" \
|
||||||
|
@ -26,6 +26,12 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
|
|||||||
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
|
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
|
||||||
|
echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
# excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly
|
# excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
sharepoint \
|
sharepoint \
|
||||||
@ -40,6 +46,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--client-cred "$SHAREPOINT_CRED" \
|
--client-cred "$SHAREPOINT_CRED" \
|
||||||
--client-id "$SHAREPOINT_CLIENT_ID" \
|
--client-id "$SHAREPOINT_CLIENT_ID" \
|
||||||
--site "$SHAREPOINT_SITE" \
|
--site "$SHAREPOINT_SITE" \
|
||||||
|
--permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \
|
||||||
|
--permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \
|
||||||
|
--permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \
|
||||||
--path "Shared Documents" \
|
--path "Shared Documents" \
|
||||||
--recursive \
|
--recursive \
|
||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
@ -45,6 +45,7 @@ class DataSourceMetadata:
|
|||||||
date_created: Optional[str] = None
|
date_created: Optional[str] = None
|
||||||
date_modified: Optional[str] = None
|
date_modified: Optional[str] = None
|
||||||
date_processed: Optional[str] = None
|
date_processed: Optional[str] = None
|
||||||
|
permissions_data: Optional[List[Dict[str, Any]]] = None
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
return {key: value for key, value in self.__dict__.items() if value is not None}
|
return {key: value for key, value in self.__dict__.items() if value is not None}
|
||||||
|
@ -11,6 +11,7 @@ from unstructured.ingest.interfaces import (
|
|||||||
ChunkingConfig,
|
ChunkingConfig,
|
||||||
EmbeddingConfig,
|
EmbeddingConfig,
|
||||||
PartitionConfig,
|
PartitionConfig,
|
||||||
|
PermissionsConfig,
|
||||||
ProcessorConfig,
|
ProcessorConfig,
|
||||||
ReadConfig,
|
ReadConfig,
|
||||||
)
|
)
|
||||||
@ -287,12 +288,12 @@ class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
|
|||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
|
Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
|
||||||
This allows CLI arguments to be prepended with chunk_ during CLI invocation but
|
This allows CLI arguments to be prepended with embedding_ during CLI invocation but
|
||||||
doesn't require that as part of the field names in this class
|
doesn't require that as part of the field names in this class
|
||||||
"""
|
"""
|
||||||
if isinstance(kvs, dict):
|
if isinstance(kvs, dict):
|
||||||
new_kvs = {
|
new_kvs = {
|
||||||
k[len("embedding-") :]: v # noqa: E203
|
k[len("embedding_") :]: v # noqa: E203
|
||||||
for k, v in kvs.items()
|
for k, v in kvs.items()
|
||||||
if k.startswith("embedding_")
|
if k.startswith("embedding_")
|
||||||
}
|
}
|
||||||
@ -363,3 +364,75 @@ class CliChunkingConfig(ChunkingConfig, CliMixin):
|
|||||||
return None
|
return None
|
||||||
return _decode_dataclass(cls, new_kvs, infer_missing)
|
return _decode_dataclass(cls, new_kvs, infer_missing)
|
||||||
return _decode_dataclass(cls, kvs, infer_missing)
|
return _decode_dataclass(cls, kvs, infer_missing)
|
||||||
|
|
||||||
|
|
||||||
|
class CliPermissionsConfig(PermissionsConfig, CliMixin):
    """CLI front-end for ``PermissionsConfig``.

    Registers the ``--permissions-*`` options on a click command and builds the
    config from the parsed CLI values.
    """

    @staticmethod
    def add_cli_options(cmd: click.Command) -> None:
        """Append the three ``--permissions-*`` options to ``cmd.params``."""
        options = [
            click.Option(
                ["--permissions-application-id"],
                type=str,
                help="Microsoft Graph API application id",
            ),
            click.Option(
                ["--permissions-client-cred"],
                type=str,
                help="Microsoft Graph API application credentials",
            ),
            click.Option(
                ["--permissions-tenant"],
                type=str,
                help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.",
            ),
        ]
        cmd.params.extend(options)

    @classmethod
    def from_dict(
        cls,
        kvs: Json,
        *,
        infer_missing=False,
    ):
        """
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
        This allows CLI arguments to be prepended with permissions_ during CLI invocation but
        doesn't require that as part of the field names in this class. It also checks if the
        CLI params are provided as intended.

        Raises:
            ValueError: if some, but not all, of the three permissions values are set.
        """
        if not isinstance(kvs, dict):
            return _decode_dataclass(cls, kvs, infer_missing)

        # .get() instead of indexing: a dict that lacks one of these keys means
        # "option not provided", not a KeyError.
        provided = [
            kvs.get("permissions_application_id"),
            kvs.get("permissions_client_cred"),
            kvs.get("permissions_tenant"),
        ]
        if any(provided) and not all(provided):
            raise ValueError(
                "Please provide either none or all of the following optional values:\n"
                "--permissions-application-id\n"
                "--permissions-client-cred\n"
                "--permissions-tenant",
            )

        new_kvs = {
            k[len("permissions_") :]: v  # noqa: E203
            for k, v in kvs.items()
            if k.startswith("permissions_")
        }
        if len(new_kvs.keys()) == 0:
            return None
        return _decode_dataclass(cls, new_kvs, infer_missing)
|
||||||
|
@ -8,6 +8,7 @@ from unstructured.ingest.cli.interfaces import (
|
|||||||
CliEmbeddingConfig,
|
CliEmbeddingConfig,
|
||||||
CliMixin,
|
CliMixin,
|
||||||
CliPartitionConfig,
|
CliPartitionConfig,
|
||||||
|
CliPermissionsConfig,
|
||||||
CliProcessorConfig,
|
CliProcessorConfig,
|
||||||
CliReadConfig,
|
CliReadConfig,
|
||||||
)
|
)
|
||||||
@ -39,6 +40,7 @@ def extract_configs(
|
|||||||
"embedding_config": CliEmbeddingConfig.from_dict(data),
|
"embedding_config": CliEmbeddingConfig.from_dict(data),
|
||||||
"chunking_config": CliChunkingConfig.from_dict(data),
|
"chunking_config": CliChunkingConfig.from_dict(data),
|
||||||
"processor_config": CliProcessorConfig.from_dict(data),
|
"processor_config": CliProcessorConfig.from_dict(data),
|
||||||
|
"permissions_config": CliPermissionsConfig.from_dict(data),
|
||||||
}
|
}
|
||||||
for v in validate:
|
for v in validate:
|
||||||
v.from_dict(data)
|
v.from_dict(data)
|
||||||
@ -52,6 +54,7 @@ def add_options(cmd: click.Command, extras=t.List[t.Type[CliMixin]]) -> click.Co
|
|||||||
CliEmbeddingConfig,
|
CliEmbeddingConfig,
|
||||||
CliChunkingConfig,
|
CliChunkingConfig,
|
||||||
CliProcessorConfig,
|
CliProcessorConfig,
|
||||||
|
CliPermissionsConfig,
|
||||||
]
|
]
|
||||||
configs.extend(extras)
|
configs.extend(extras)
|
||||||
for config in configs:
|
for config in configs:
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
import typing as t
|
import typing as t
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -15,6 +17,7 @@ from unstructured.ingest.interfaces import (
|
|||||||
SourceConnectorCleanupMixin,
|
SourceConnectorCleanupMixin,
|
||||||
SourceMetadata,
|
SourceMetadata,
|
||||||
)
|
)
|
||||||
|
from unstructured.ingest.interfaces import PermissionsConfig as SharepointPermissionsConfig
|
||||||
from unstructured.ingest.logger import logger
|
from unstructured.ingest.logger import logger
|
||||||
from unstructured.utils import requires_dependencies
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
@ -35,6 +38,7 @@ class SimpleSharepointConfig(BaseConnectorConfig):
|
|||||||
path: str
|
path: str
|
||||||
process_pages: bool = False
|
process_pages: bool = False
|
||||||
recursive: bool = False
|
recursive: bool = False
|
||||||
|
permissions_config: t.Optional[SharepointPermissionsConfig] = None
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
if not (self.client_id and self.client_credential and self.site_url):
|
if not (self.client_id and self.client_credential and self.site_url):
|
||||||
@ -57,6 +61,14 @@ class SimpleSharepointConfig(BaseConnectorConfig):
|
|||||||
raise
|
raise
|
||||||
return site_client
|
return site_client
|
||||||
|
|
||||||
|
def get_permissions_client(self):
    """Build a permissions connector from ``self.permissions_config``.

    Best effort: returns the connector when it holds a non-empty access token;
    on any failure the error is logged and None is returned.
    """
    try:
        permissions_connector = SharepointPermissionsConnector(self.permissions_config)
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if not permissions_connector.access_token:
            raise ValueError("empty access token")
        return permissions_connector
    except Exception as e:
        # The %s placeholder is required: without it the logging module fails to
        # format the record and the actual cause never reaches the log.
        logger.error("Couldn't obtain Sharepoint permissions ingestion access token: %s", e)
        return None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||||
@ -122,7 +134,6 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
|||||||
file = site_client.web.get_file_by_server_relative_url(self.server_path)
|
file = site_client.web.get_file_by_server_relative_url(self.server_path)
|
||||||
if properties_only:
|
if properties_only:
|
||||||
file = file.get().execute_query()
|
file = file.get().execute_query()
|
||||||
|
|
||||||
except ClientRequestException as e:
|
except ClientRequestException as e:
|
||||||
if e.response.status_code == 404:
|
if e.response.status_code == 404:
|
||||||
return None
|
return None
|
||||||
@ -144,6 +155,44 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
|||||||
return None
|
return None
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
def update_permissions_data(self):
    """Load the permissions JSON stored on disk for this document, if any.

    Looks in ``<output_dir>/permissions_data/sites`` when the first component of
    ``self.file_path`` is ``"sites"``, otherwise in
    ``<output_dir>/permissions_data/other``. Files are expected to be named
    ``<parent>_SEP_<docname><ext>``; names without ``_SEP_`` are ignored.

    Returns:
        The parsed JSON of the matching file, or None when the directory is
        missing or nothing matches. If several files match, the last one
        listed wins.
    """

    def parent_name_matches(parent_type, permissions_filename, ingest_doc_filepath):
        # For documents under "sites", the part before _SEP_ must equal the
        # second path component of the document's file path.
        permissions_filename = permissions_filename.split("_SEP_")
        ingest_doc_filepath = ingest_doc_filepath.split("/")

        if parent_type == "sites":
            return (
                len(ingest_doc_filepath) > 1
                and permissions_filename[0] == ingest_doc_filepath[1]
            )
        # Any other parent type never matches.
        return parent_type in ("SitePages", "Shared Documents")

    permissions_data = None
    permissions_dir = Path(self.processor_config.output_dir) / "permissions_data"
    if not permissions_dir.is_dir():
        return permissions_data

    parent_type = self.file_path.split("/")[0]
    read_dir = permissions_dir / ("sites" if parent_type == "sites" else "other")
    # The subdirectory may be absent even though permissions_data/ exists.
    if not read_dir.is_dir():
        return permissions_data

    ingestdoc_docname = self.file_path.split("/")[-1]
    for filename in os.listdir(read_dir):
        name_parts = os.path.splitext(filename)[0].split("_SEP_")
        if len(name_parts) < 2:
            # Not a permissions file written in the <parent>_SEP_<docname> form.
            continue

        if name_parts[1] == ingestdoc_docname and parent_name_matches(
            parent_type=parent_type,
            permissions_filename=filename,
            ingest_doc_filepath=self.file_path,
        ):
            with open(read_dir / filename) as f:
                permissions_data = json.load(f)

    return permissions_data
|
||||||
|
|
||||||
def update_source_metadata(self, **kwargs):
|
def update_source_metadata(self, **kwargs):
|
||||||
if self.is_page:
|
if self.is_page:
|
||||||
page = self._fetch_page()
|
page = self._fetch_page()
|
||||||
@ -158,6 +207,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
|||||||
version=page.get_property("Version", ""),
|
version=page.get_property("Version", ""),
|
||||||
source_url=page.absolute_url,
|
source_url=page.absolute_url,
|
||||||
exists=True,
|
exists=True,
|
||||||
|
permissions_data=self.update_permissions_data()
|
||||||
|
if self.connector_config.permissions_config
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -176,6 +228,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
|||||||
version=file.major_version,
|
version=file.major_version,
|
||||||
source_url=file.properties.get("LinkingUrl", None),
|
source_url=file.properties.get("LinkingUrl", None),
|
||||||
exists=True,
|
exists=True,
|
||||||
|
permissions_data=self.update_permissions_data()
|
||||||
|
if self.connector_config.permissions_config
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _download_page(self):
|
def _download_page(self):
|
||||||
@ -317,6 +372,12 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
|
|||||||
|
|
||||||
def get_ingest_docs(self):
|
def get_ingest_docs(self):
|
||||||
base_site_client = self.connector_config.get_site_client()
|
base_site_client = self.connector_config.get_site_client()
|
||||||
|
|
||||||
|
if self.connector_config.permissions_config:
|
||||||
|
permissions_client = self.connector_config.get_permissions_client()
|
||||||
|
if permissions_client:
|
||||||
|
permissions_client.write_all_permissions(self.processor_config.output_dir)
|
||||||
|
|
||||||
if not base_site_client.is_tenant:
|
if not base_site_client.is_tenant:
|
||||||
return self._ingest_site_docs(base_site_client)
|
return self._ingest_site_docs(base_site_client)
|
||||||
tenant = base_site_client.tenant
|
tenant = base_site_client.tenant
|
||||||
@ -328,3 +389,166 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
|
|||||||
site_client = self.connector_config.get_site_client(site_url)
|
site_client = self.connector_config.get_site_client(site_url)
|
||||||
ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
|
ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
|
||||||
return ingest_docs
|
return ingest_docs
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SharepointPermissionsConnector:
    """Collect file-level permissions (RBAC) data through the Microsoft Graph API.

    On construction the connector requests an OAuth2 access token with the
    client-credentials grant, using the application id, client credential and
    tenant found on ``permissions_config``. The token is then sent as a bearer
    token on every Graph request.
    """

    def __init__(self, permissions_config):
        self.permissions_config: SharepointPermissionsConfig = permissions_config
        self.initialize()

    def initialize(self):
        """Fetch and store the access token used by all subsequent requests."""
        self.access_token: str = self.get_access_token()

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_access_token(self) -> str:
        """Request an access token for the Graph API and return it.

        Raises:
            KeyError: if the token endpoint's JSON reply has no ``access_token``
                entry (e.g. when the credentials are rejected).
        """
        import requests

        url = (
            f"https://login.microsoftonline.com/{self.permissions_config.tenant}/oauth2/v2.0/token"
        )
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        data = {
            "client_id": self.permissions_config.application_id,
            "scope": "https://graph.microsoft.com/.default",
            "client_secret": self.permissions_config.client_cred,
            "grant_type": "client_credentials",
        }
        response = requests.post(url, headers=headers, data=data)
        return response.json()["access_token"]

    def _auth_headers(self):
        """Return the bearer-token header shared by every Graph request."""
        return {"Authorization": f"Bearer {self.access_token}"}

    def validated_response(self, response):
        """Return the decoded JSON body for a 200 response, otherwise ``None``.

        Non-200 responses are reported on stdout; callers are expected to
        check for a falsy return value before using the result.
        """
        if response.status_code == 200:
            return response.json()

        print(f"Request failed with status code {response.status_code}:")
        print(response.text)
        return None

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_sites(self):
        """List sites (``webUrl`` and ``id`` only); ``None`` if the request fails."""
        import requests

        url = "https://graph.microsoft.com/v1.0/sites"
        params = {
            "$select": "webUrl, id",
        }
        response = requests.get(url, params=params, headers=self._auth_headers())
        return self.validated_response(response)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drives(self, site):
        """List the drives of the site with id ``site``; ``None`` on failure."""
        import requests

        url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives"
        response = requests.get(url, headers=self._auth_headers())
        return self.validated_response(response)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drive_items(self, site, drive_id):
        """List the children of a drive's root folder; ``None`` on failure."""
        import requests

        url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives/{drive_id}/root/children"
        response = requests.get(url, headers=self._auth_headers())
        return self.validated_response(response)

    def extract_site_name_from_weburl(self, weburl):
        """Classify an item's web URL by the first segment of its path.

        Returns:
            A ``(parent_type, parent_name)`` tuple, or ``(None, None)`` when the
            URL structure is not recognised (a warning is logged in that case).
        """
        split_path = urlparse(weburl).path.lstrip("/").split("/")
        parent = split_path[0]

        # A bare "/sites" path carries no site name, so it falls through to
        # the "unknown" case instead of raising IndexError.
        if parent == "sites" and len(split_path) > 1:
            return "sites", split_path[1]

        if parent == "Shared%20Documents":
            return "Shared Documents", "Shared Documents"

        if parent == "personal":
            return "Personal", "Personal"

        if parent == "_layouts":
            return "layouts", "layouts"

        # if other weburl structures are found, additional logic might need to be implemented
        logger.warning(
            "Couldn't extract sitename, unknown site or parent type. Skipping permissions "
            "ingestion for the document with the URL: %s",
            weburl,
        )
        return None, None

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_permissions_for_drive_item(self, site, drive_id, item_id):
        """Fetch the permissions of a single drive item; ``None`` on failure."""
        import requests

        # Built from adjacent f-strings: a backslash continuation inside one
        # string literal would embed the next line's indentation in the URL.
        url = (
            f"https://graph.microsoft.com/v1.0/sites/{site}"
            f"/drives/{drive_id}/items/{item_id}/permissions"
        )
        response = requests.get(url, headers=self._auth_headers())
        return self.validated_response(response)

    def write_all_permissions(self, output_dir):
        """Write one JSON file of permissions per drive item under ``output_dir``.

        Files are written to ``<output_dir>/permissions_data/sites`` for items
        that live under a site and to ``<output_dir>/permissions_data/other``
        for every other recognised location. Each file is named
        ``<parent_name>_SEP_<item_name>.json`` and holds the ``value`` list of
        the permissions response.

        NOTE(review): only the ``value`` list of each single response is read,
        and only the direct children of each drive root are listed; confirm
        whether paging and nested folders need to be handled.
        """
        sites_response = self.get_sites()
        if not sites_response:
            # The failure was already reported by validated_response.
            return

        sites = [(site["id"], site["webUrl"]) for site in sites_response["value"]]
        drive_ids = []

        print("Obtaining drive data for sites for permissions (rbac)")
        for site_id, _site_url in sites:
            drives = self.get_drives(site_id)
            if drives:
                drive_ids.extend((site_id, drive["id"]) for drive in drives["value"])

        print("Obtaining item data from drives for permissions (rbac)")
        item_ids = []
        for site, drive_id in drive_ids:
            drive_items = self.get_drive_items(site, drive_id)
            if drive_items:
                item_ids.extend(
                    (site, drive_id, item["id"], item["name"], item["webUrl"])
                    for item in drive_items["value"]
                )

        permissions_dir = Path(output_dir) / "permissions_data"

        print("Writing permissions data to disk")
        for site, drive_id, item_id, item_name, item_web_url in item_ids:
            res = self.get_permissions_for_drive_item(site, drive_id, item_id)
            if not res:
                continue

            parent_type, parent_name = self.extract_site_name_from_weburl(item_web_url)
            if parent_type is None:
                # Unknown URL structure: the warning already announced the skip.
                continue

            sub_dir = "sites" if parent_type == "sites" else "other"
            write_path = permissions_dir / sub_dir / f"{parent_name}_SEP_{item_name}.json"
            write_path.parent.mkdir(parents=True, exist_ok=True)

            with open(write_path, "w") as f:
                json.dump(res["value"], f)
|
||||||
|
@ -107,6 +107,14 @@ class ChunkingConfig(BaseConfig):
|
|||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PermissionsConfig(BaseConfig):
    """Credentials used to reach the external API that serves permissions data."""

    # Identifier of the application registered for permissions access.
    application_id: t.Optional[str]
    # Client credential (secret) belonging to that application.
    client_cred: t.Optional[str]
    # Tenant the application authenticates against.
    tenant: t.Optional[str]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class WriteConfig(BaseConfig):
|
class WriteConfig(BaseConfig):
|
||||||
pass
|
pass
|
||||||
@ -123,6 +131,7 @@ class SourceMetadata(DataClassJsonMixin, ABC):
|
|||||||
version: t.Optional[str] = None
|
version: t.Optional[str] = None
|
||||||
source_url: t.Optional[str] = None
|
source_url: t.Optional[str] = None
|
||||||
exists: t.Optional[bool] = None
|
exists: t.Optional[bool] = None
|
||||||
|
permissions_data: t.Optional[t.List[t.Dict[str, t.Any]]] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -212,6 +221,13 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
|
|||||||
the version of the document."""
|
the version of the document."""
|
||||||
return self.source_metadata.version # type: ignore
|
return self.source_metadata.version # type: ignore
|
||||||
|
|
||||||
|
@property
def permissions_data(self) -> t.Optional[t.List[t.Dict[str, t.Any]]]:
    """Return the access control (permissions / sharing) data from the source system.

    The source metadata is populated first when it has not been set yet.
    """
    if self.source_metadata is None:
        self.update_source_metadata()
    metadata = self.source_metadata
    return metadata.permissions_data  # type: ignore
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def cleanup_file(self):
|
def cleanup_file(self):
|
||||||
"""Removes the local copy the file (or anything else) after successful processing."""
|
"""Removes the local copy the file (or anything else) after successful processing."""
|
||||||
@ -240,6 +256,12 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
|
|||||||
"""Sets the SourceMetadata and the properties for the doc"""
|
"""Sets the SourceMetadata and the properties for the doc"""
|
||||||
self._source_metadata = SourceMetadata()
|
self._source_metadata = SourceMetadata()
|
||||||
|
|
||||||
|
def update_permissions_data(self):
    """Reset the doc's ``_permissions_data`` attribute to ``None``.

    The attribute is the value later used to fill
    ``SourceMetadata.permissions_data``, which in turn backs the
    ``permissions_data`` property. This base implementation stores no data
    and returns nothing.
    """
    self._permissions_data: t.Optional[t.List[t.Dict]] = None
|
||||||
|
|
||||||
# NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods
|
# NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods
|
||||||
# in addition to or instead of get_file()
|
# in addition to or instead of get_file()
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@ -269,6 +291,7 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
|
|||||||
date_created=self.date_created,
|
date_created=self.date_created,
|
||||||
date_modified=self.date_modified,
|
date_modified=self.date_modified,
|
||||||
date_processed=self.date_processed,
|
date_processed=self.date_processed,
|
||||||
|
permissions_data=self.permissions_data,
|
||||||
),
|
),
|
||||||
**partition_kwargs,
|
**partition_kwargs,
|
||||||
)
|
)
|
||||||
@ -420,6 +443,38 @@ class SourceConnectorCleanupMixin:
|
|||||||
os.rmdir(cur_dir)
|
os.rmdir(cur_dir)
|
||||||
|
|
||||||
|
|
||||||
|
class PermissionsCleanupMixin:
    """Mixin that deletes the permissions data stored under the output directory."""

    processor_config: ProcessorConfig

    def cleanup_permissions(self, cur_dir=None):
        """Recursively clean up the permissions files and directories.

        Args:
            cur_dir: file or directory to remove. Defaults to the
                ``permissions_data`` folder inside
                ``processor_config.output_dir``.

        A path that does not exist is ignored, so the cleanup is safe to run
        even when no permissions data was written. Symlinks found inside a
        directory are never followed or removed, and a directory is removed
        only once it is empty.
        """
        if cur_dir is None:
            cur_dir = Path(self.processor_config.output_dir, "permissions_data")

        cur_path = Path(cur_dir)
        if not cur_path.exists():
            return

        if cur_path.is_file():
            cur_path.unlink()
            return

        # Children are addressed through their full path, so the process-wide
        # working directory is never changed. The listing is materialised
        # first because entries are deleted while walking it.
        for child in list(cur_path.iterdir()):
            # don't traverse symlinks, not that there ever should be any
            if not child.is_symlink():
                self.cleanup_permissions(child)

        # Anything left (a skipped symlink, or a folder that still holds one)
        # keeps the directory in place rather than failing the cleanup.
        if not any(cur_path.iterdir()):
            cur_path.rmdir()
|
||||||
|
|
||||||
|
|
||||||
class IngestDocCleanupMixin:
|
class IngestDocCleanupMixin:
|
||||||
read_config: ReadConfig
|
read_config: ReadConfig
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from .doc_factory import DocFactory
|
from .doc_factory import DocFactory
|
||||||
from .interfaces import PipelineContext, ReformatNode
|
from .interfaces import PipelineContext, ReformatNode
|
||||||
from .partition import Partitioner
|
from .partition import Partitioner
|
||||||
|
from .permissions import PermissionsDataCleaner
|
||||||
from .pipeline import Pipeline
|
from .pipeline import Pipeline
|
||||||
from .reformat.chunking import Chunker
|
from .reformat.chunking import Chunker
|
||||||
from .reformat.embedding import Embedder
|
from .reformat.embedding import Embedder
|
||||||
@ -17,4 +18,5 @@ __all__ = [
|
|||||||
"Writer",
|
"Writer",
|
||||||
"Chunker",
|
"Chunker",
|
||||||
"ReformatNode",
|
"ReformatNode",
|
||||||
|
"PermissionsDataCleaner",
|
||||||
]
|
]
|
||||||
|
@ -212,3 +212,18 @@ class CopyNode(PipelineNode):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def run(self, json_path: str):
|
def run(self, json_path: str):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PermissionsNode(PipelineNode):
    """Pipeline node that encapsulates operations on permissions-related data."""

    def initialize(self):
        """Log that the node is starting, then run the parent initialization."""
        logger.info("Running permissions node to cleanup the permissions folder")
        super().initialize()

    @abstractmethod
    def run(self):
        """Perform the node's work; concrete subclasses must implement this."""
|
||||||
|
12
unstructured/ingest/pipeline/permissions.py
Normal file
12
unstructured/ingest/pipeline/permissions.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from unstructured.ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig
|
||||||
|
from unstructured.ingest.pipeline.interfaces import PermissionsNode
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin):
    """Permissions node whose run step removes the stored permissions data."""

    # Read by the cleanup mixin to locate the output directory.
    processor_config: ProcessorConfig

    def run(self):
        """Delete the permissions data via the cleanup mixin."""
        self.cleanup_permissions()
|
@ -15,6 +15,7 @@ from unstructured.ingest.pipeline.interfaces import (
|
|||||||
SourceNode,
|
SourceNode,
|
||||||
WriteNode,
|
WriteNode,
|
||||||
)
|
)
|
||||||
|
from unstructured.ingest.pipeline.permissions import PermissionsDataCleaner
|
||||||
from unstructured.ingest.pipeline.utils import get_ingest_doc_hash
|
from unstructured.ingest.pipeline.utils import get_ingest_doc_hash
|
||||||
|
|
||||||
|
|
||||||
@ -26,6 +27,7 @@ class Pipeline(DataClassJsonMixin):
|
|||||||
partition_node: PartitionNode
|
partition_node: PartitionNode
|
||||||
write_node: t.Optional[WriteNode] = None
|
write_node: t.Optional[WriteNode] = None
|
||||||
reformat_nodes: t.List[ReformatNode] = field(default_factory=list)
|
reformat_nodes: t.List[ReformatNode] = field(default_factory=list)
|
||||||
|
permissions_node: t.Optional[PermissionsDataCleaner] = None
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
|
ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
|
||||||
@ -79,3 +81,6 @@ class Pipeline(DataClassJsonMixin):
|
|||||||
|
|
||||||
if self.write_node:
|
if self.write_node:
|
||||||
self.write_node(iterable=partitioned_jsons)
|
self.write_node(iterable=partitioned_jsons)
|
||||||
|
|
||||||
|
if self.permissions_node:
|
||||||
|
self.permissions_node.cleanup_permissions()
|
||||||
|
@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import (
|
|||||||
ChunkingConfig,
|
ChunkingConfig,
|
||||||
EmbeddingConfig,
|
EmbeddingConfig,
|
||||||
PartitionConfig,
|
PartitionConfig,
|
||||||
|
PermissionsConfig,
|
||||||
ProcessorConfig,
|
ProcessorConfig,
|
||||||
)
|
)
|
||||||
from unstructured.ingest.pipeline import (
|
from unstructured.ingest.pipeline import (
|
||||||
@ -15,6 +16,7 @@ from unstructured.ingest.pipeline import (
|
|||||||
DocFactory,
|
DocFactory,
|
||||||
Embedder,
|
Embedder,
|
||||||
Partitioner,
|
Partitioner,
|
||||||
|
PermissionsDataCleaner,
|
||||||
Pipeline,
|
Pipeline,
|
||||||
PipelineContext,
|
PipelineContext,
|
||||||
Reader,
|
Reader,
|
||||||
@ -33,6 +35,7 @@ def process_documents(
|
|||||||
dest_doc_connector: t.Optional[BaseDestinationConnector] = None,
|
dest_doc_connector: t.Optional[BaseDestinationConnector] = None,
|
||||||
chunking_config: t.Optional[ChunkingConfig] = None,
|
chunking_config: t.Optional[ChunkingConfig] = None,
|
||||||
embedder_config: t.Optional[EmbeddingConfig] = None,
|
embedder_config: t.Optional[EmbeddingConfig] = None,
|
||||||
|
permissions_config: t.Optional[PermissionsConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
pipeline_config = PipelineContext.from_dict(processor_config.to_dict())
|
pipeline_config = PipelineContext.from_dict(processor_config.to_dict())
|
||||||
doc_factory = DocFactory(
|
doc_factory = DocFactory(
|
||||||
@ -64,6 +67,11 @@ def process_documents(
|
|||||||
if dest_doc_connector
|
if dest_doc_connector
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
permissions_data_cleaner = (
|
||||||
|
PermissionsDataCleaner(pipeline_context=pipeline_config, processor_config=processor_config)
|
||||||
|
if permissions_config
|
||||||
|
else None
|
||||||
|
)
|
||||||
pipeline = Pipeline(
|
pipeline = Pipeline(
|
||||||
pipeline_context=pipeline_config,
|
pipeline_context=pipeline_config,
|
||||||
doc_factory_node=doc_factory,
|
doc_factory_node=doc_factory,
|
||||||
@ -71,5 +79,6 @@ def process_documents(
|
|||||||
partition_node=partitioner,
|
partition_node=partitioner,
|
||||||
reformat_nodes=reformat_nodes,
|
reformat_nodes=reformat_nodes,
|
||||||
write_node=writer,
|
write_node=writer,
|
||||||
|
permissions_node=permissions_data_cleaner,
|
||||||
)
|
)
|
||||||
pipeline.run()
|
pipeline.run()
|
||||||
|
@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import (
|
|||||||
ChunkingConfig,
|
ChunkingConfig,
|
||||||
EmbeddingConfig,
|
EmbeddingConfig,
|
||||||
PartitionConfig,
|
PartitionConfig,
|
||||||
|
PermissionsConfig,
|
||||||
ProcessorConfig,
|
ProcessorConfig,
|
||||||
ReadConfig,
|
ReadConfig,
|
||||||
)
|
)
|
||||||
@ -24,6 +25,7 @@ class Runner(ABC):
|
|||||||
writer_kwargs: t.Optional[dict] = None
|
writer_kwargs: t.Optional[dict] = None
|
||||||
embedding_config: t.Optional[EmbeddingConfig] = None
|
embedding_config: t.Optional[EmbeddingConfig] = None
|
||||||
chunking_config: t.Optional[ChunkingConfig] = None
|
chunking_config: t.Optional[ChunkingConfig] = None
|
||||||
|
permissions_config: t.Optional[PermissionsConfig] = None
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def run(self, *args, **kwargs):
|
def run(self, *args, **kwargs):
|
||||||
@ -36,6 +38,18 @@ class Runner(ABC):
|
|||||||
return writer(**writer_kwargs)
|
return writer(**writer_kwargs)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def get_permissions_config(self) -> t.Optional[PermissionsConfig]:
    """Return the permissions config only when it is completely filled in.

    Returns:
        ``self.permissions_config`` when its application id, client credential
        and tenant are all truthy; ``None`` when the config is missing or any
        of the three values is empty.
    """
    config = self.permissions_config
    if config is None:
        return None

    required_values = (config.application_id, config.client_cred, config.tenant)
    if all(required_values):
        return config
    return None
|
||||||
|
|
||||||
def process_documents(self, source_doc_connector: BaseSourceConnector):
|
def process_documents(self, source_doc_connector: BaseSourceConnector):
|
||||||
process_documents(
|
process_documents(
|
||||||
processor_config=self.processor_config,
|
processor_config=self.processor_config,
|
||||||
@ -44,4 +58,5 @@ class Runner(ABC):
|
|||||||
dest_doc_connector=self.get_dest_doc_connector(),
|
dest_doc_connector=self.get_dest_doc_connector(),
|
||||||
embedder_config=self.embedding_config,
|
embedder_config=self.embedding_config,
|
||||||
chunking_config=self.chunking_config,
|
chunking_config=self.chunking_config,
|
||||||
|
permissions_config=self.get_permissions_config(),
|
||||||
)
|
)
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
|
import typing as t
|
||||||
|
|
||||||
from unstructured.ingest.logger import ingest_log_streaming_init, logger
|
from unstructured.ingest.logger import ingest_log_streaming_init, logger
|
||||||
from unstructured.ingest.runner.base_runner import Runner
|
from unstructured.ingest.runner.base_runner import Runner
|
||||||
@ -12,6 +13,9 @@ class SharePointRunner(Runner):
|
|||||||
site: str,
|
site: str,
|
||||||
client_id: str,
|
client_id: str,
|
||||||
client_cred: str,
|
client_cred: str,
|
||||||
|
permissions_application_id: t.Optional[str],
|
||||||
|
permissions_client_cred: t.Optional[str],
|
||||||
|
permissions_tenant: t.Optional[str],
|
||||||
path: str,
|
path: str,
|
||||||
files_only: bool = False,
|
files_only: bool = False,
|
||||||
recursive: bool = False,
|
recursive: bool = False,
|
||||||
@ -31,10 +35,17 @@ class SharePointRunner(Runner):
|
|||||||
)
|
)
|
||||||
|
|
||||||
from unstructured.ingest.connector.sharepoint import (
|
from unstructured.ingest.connector.sharepoint import (
|
||||||
|
SharepointPermissionsConfig,
|
||||||
SharepointSourceConnector,
|
SharepointSourceConnector,
|
||||||
SimpleSharepointConfig,
|
SimpleSharepointConfig,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
permissions_config = SharepointPermissionsConfig(
|
||||||
|
application_id=permissions_application_id,
|
||||||
|
client_cred=permissions_client_cred,
|
||||||
|
tenant=permissions_tenant,
|
||||||
|
)
|
||||||
|
|
||||||
source_doc_connector = SharepointSourceConnector( # type: ignore
|
source_doc_connector = SharepointSourceConnector( # type: ignore
|
||||||
processor_config=self.processor_config,
|
processor_config=self.processor_config,
|
||||||
connector_config=SimpleSharepointConfig(
|
connector_config=SimpleSharepointConfig(
|
||||||
@ -44,6 +55,7 @@ class SharePointRunner(Runner):
|
|||||||
path=path,
|
path=path,
|
||||||
process_pages=(not files_only),
|
process_pages=(not files_only),
|
||||||
recursive=recursive,
|
recursive=recursive,
|
||||||
|
permissions_config=permissions_config,
|
||||||
),
|
),
|
||||||
read_config=self.read_config,
|
read_config=self.read_config,
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user