mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	feat: add file-based access permissions for SharePoint ingest (#1628)
This PR: - defines rbac_data as a SourceMetadata field, - manages connections to an external api for obtaining rbac data with ConnectorRBAC class, - serializes rbac data and saves it to the disk, - matches the rbac_data in the disk to each IngestDoc, using a common field, - forwards rbac data to Elements, via the partition() function To test the changes, run `examples/ingest/sharepoint/ingest.sh` with the relevant rbac & connector credentials --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									3ec3673d34
								
							
						
					
					
						commit
						94836cfad4
					
				
							
								
								
									
										3
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							@ -288,6 +288,9 @@ jobs:
 | 
			
		||||
        SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
 | 
			
		||||
        SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
 | 
			
		||||
        SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
 | 
			
		||||
        SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
 | 
			
		||||
        SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
 | 
			
		||||
        SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
 | 
			
		||||
        SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
 | 
			
		||||
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
			
		||||
        NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
 | 
			
		||||
 | 
			
		||||
@ -85,6 +85,9 @@ jobs:
 | 
			
		||||
          SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
 | 
			
		||||
          SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
 | 
			
		||||
          SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
 | 
			
		||||
          SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
 | 
			
		||||
          SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
 | 
			
		||||
          SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
 | 
			
		||||
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
 | 
			
		||||
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
			
		||||
          NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
 | 
			
		||||
 | 
			
		||||
@ -8,9 +8,11 @@
 | 
			
		||||
### Features
 | 
			
		||||
* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.
 | 
			
		||||
 | 
			
		||||
* **Adds permissions (RBAC) data ingestion functionality for the Sharepoint connector.** Problem: Role based access control is an important component in many data storage systems. Users may need to pass permissions (RBAC) data to downstream systems when ingesting data. Feature: Added permissions data ingestion functionality to the Sharepoint connector.
 | 
			
		||||
 | 
			
		||||
### Fixes
 | 
			
		||||
 | 
			
		||||
* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list items
 | 
			
		||||
* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list item
 | 
			
		||||
* **Fixes duplicated elements** Fixes issue where elements are duplicated when embeddings are generated. This will allow users to generate embeddings for their list of Elements without duplicating/breaking the original content.
 | 
			
		||||
* **Fixes failure when flagging for embeddings through unstructured-ingest** Currently adding the embedding parameter to any connector results in a failure on the copy stage. This resolves the issue by adding the IngestDoc to the context map in the embedding node's `run` method. This allows users to specify that connectors fetch embeddings without failure.
 | 
			
		||||
* **Fix ingest pipeline reformat nodes not discoverable** Fixes issue where reformat nodes raise ModuleNotFoundError on import. This was because the directory was missing the `__init__.py` file needed to make it discoverable.
 | 
			
		||||
 | 
			
		||||
@ -22,6 +22,9 @@ Run Locally
 | 
			
		||||
          --client-id "<Microsoft Sharepoint app client-id>" \
 | 
			
		||||
          --client-cred "<Microsoft Sharepoint app client-secret>" \
 | 
			
		||||
          --site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
 | 
			
		||||
          --permissions-application-id "<Microsoft Graph API application id, to process per-file access permissions>" \
 | 
			
		||||
          --permissions-client-cred "<Microsoft Graph API application credentials, to process per-file access permissions>" \
 | 
			
		||||
          --permissions-tenant "<e.g https://contoso.onmicrosoft.com (tenant URL) to process per-file access permissions>" \
 | 
			
		||||
          --files-only "Flag to process only files within the site(s)" \
 | 
			
		||||
          --output-dir sharepoint-ingest-output \
 | 
			
		||||
          --num-processes 2 \
 | 
			
		||||
@ -46,6 +49,10 @@ Run Locally
 | 
			
		||||
                client_id="<Microsoft Sharepoint app client-id>",
 | 
			
		||||
                client_cred="<Microsoft Sharepoint app client-secret>",
 | 
			
		||||
                site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
 | 
			
		||||
                # Credentials to process data about permissions (rbac) within the tenant
 | 
			
		||||
                permissions_application_id="<Microsoft Graph API application id>",
 | 
			
		||||
                permissions_client_cred="<Microsoft Graph API application credentials>",
 | 
			
		||||
                permissions_tenant="<e.g https://contoso.onmicrosoft.com to process permission info within tenant>",
 | 
			
		||||
                # Flag to process only files within the site(s)
 | 
			
		||||
                files_only=True,
 | 
			
		||||
                path="Shared Documents",
 | 
			
		||||
@ -68,6 +75,9 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
 | 
			
		||||
          --client-id "<Microsoft Sharepoint app client-id>" \
 | 
			
		||||
          --client-cred "<Microsoft Sharepoint app client-secret>" \
 | 
			
		||||
          --site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
 | 
			
		||||
          --permissions-application-id "<Microsoft Graph API application id, to process per-file access permissions>" \
 | 
			
		||||
          --permissions-client-cred "<Microsoft Graph API application credentials, to process per-file access permissions>" \
 | 
			
		||||
          --permissions-tenant "<e.g https://contoso.onmicrosoft.com (tenant URL) to process per-file access permissions>" \
 | 
			
		||||
          --files-only "Flag to process only files within the site(s)" \
 | 
			
		||||
          --output-dir sharepoint-ingest-output \
 | 
			
		||||
          --num-processes 2 \
 | 
			
		||||
@ -98,6 +108,10 @@ You can also use upstream connectors with the ``unstructured`` API. For this you
 | 
			
		||||
                client_id="<Microsoft Sharepoint app client-id>",
 | 
			
		||||
                client_cred="<Microsoft Sharepoint app client-secret>",
 | 
			
		||||
                site="<e.g https://contoso.sharepoint.com to process all sites within tenant>",
 | 
			
		||||
                # Credentials to process data about permissions (rbac) within the tenant
 | 
			
		||||
                permissions_application_id="<Microsoft Graph API application id>",
 | 
			
		||||
                permissions_client_cred="<Microsoft Graph API application credentials>",
 | 
			
		||||
                permissions_tenant="<e.g https://contoso.onmicrosoft.com to process permission info within tenant>",
 | 
			
		||||
                # Flag to process only files within the site(s)
 | 
			
		||||
                files_only=True,
 | 
			
		||||
                path="Shared Documents",
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										5
									
								
								examples/ingest/sharepoint/ingest.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										5
									
								
								examples/ingest/sharepoint/ingest.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							@ -12,6 +12,8 @@
 | 
			
		||||
# To get the credentials for your Sharepoint app, follow these steps:
 | 
			
		||||
# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal
 | 
			
		||||
 | 
			
		||||
# To optionally set up your application and obtain permissions related variables (--permissions-application-id, --permissions-client-cred, --permissions-tenant), follow these steps:
 | 
			
		||||
# https://tsmatz.wordpress.com/2016/10/07/application-permission-with-v2-endpoint-and-microsoft-graph
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 | 
			
		||||
@ -22,6 +24,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 | 
			
		||||
    --client-id "<Microsoft Sharepoint app client-id>" \
 | 
			
		||||
    --client-cred "<Microsoft Sharepoint app client-secret>" \
 | 
			
		||||
    --site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
 | 
			
		||||
    --permissions-application-id "<Microsoft Graph API application id to process per-file access permissions>" \
 | 
			
		||||
    --permissions-client-cred "<Microsoft Graph API application credentials to process per-file access permissions>" \
 | 
			
		||||
    --permissions-tenant "<e.g https://contoso.onmicrosoft.com to process per-file access permissions>" \
 | 
			
		||||
    --files-only "Flag to process only files within the site(s)" \
 | 
			
		||||
    --output-dir sharepoint-ingest-output \
 | 
			
		||||
    --num-processes 2 \
 | 
			
		||||
 | 
			
		||||
@ -46,6 +46,7 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
 | 
			
		||||
elif ! diff -ru "$EXPECTED_OUTPUT_DIR" "$OUTPUT_DIR" ; then
 | 
			
		||||
    "$SCRIPT_DIR"/json-to-clean-text-folder.sh "$EXPECTED_OUTPUT_DIR" "$EXPECTED_OUTPUT_DIR_TEXT"
 | 
			
		||||
    "$SCRIPT_DIR"/json-to-clean-text-folder.sh "$OUTPUT_DIR" "$OUTPUT_DIR_TEXT"
 | 
			
		||||
    "$SCRIPT_DIR"/clean-permissions-files.sh "$OUTPUT_DIR_TEXT"
 | 
			
		||||
    diff -ru "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"> outputdiff.txt
 | 
			
		||||
    cat outputdiff.txt
 | 
			
		||||
    diffstat -c outputdiff.txt
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										27
									
								
								test_unstructured_ingest/clean-permissions-files.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										27
									
								
								test_unstructured_ingest/clean-permissions-files.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,27 @@
 | 
			
		||||
#!/usr/bin/env bash
 | 
			
		||||
 | 
			
		||||
# Description: Delete (cleanup) permissions files in a folder, so that they are not included in
 | 
			
		||||
#              text diff tests.
 | 
			
		||||
#
 | 
			
		||||
# Arguments:
 | 
			
		||||
#   - $1: Name of the folder to do the cleanup operation in.
 | 
			
		||||
 | 
			
		||||
set +e
 | 
			
		||||
if [ "$#" -ne 1 ]; then
 | 
			
		||||
    echo "Please provide a folder to clean the files in: $0 <folder_path>"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
folder_path="$1"
 | 
			
		||||
if [ ! -d "$folder_path" ]; then
 | 
			
		||||
    echo "'$folder_path' is not a directory. Please provide a folder / directory."
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
for file in "$folder_path"/*_SEP_*; do
 | 
			
		||||
    if [ -e "$file" ]; then
 | 
			
		||||
        rm "$file"
 | 
			
		||||
    fi
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
echo "Completed cleanup for permissions files"
 | 
			
		||||
@ -11,7 +11,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:04:55",
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55"
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "fake-text.txt",
 | 
			
		||||
      "filetype": "text/plain",
 | 
			
		||||
@ -33,7 +121,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:04:55",
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55"
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "fake-text.txt",
 | 
			
		||||
      "filetype": "text/plain",
 | 
			
		||||
@ -55,7 +231,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:04:55",
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55"
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "fake-text.txt",
 | 
			
		||||
      "filetype": "text/plain",
 | 
			
		||||
@ -77,7 +341,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:04:55",
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55"
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "fake-text.txt",
 | 
			
		||||
      "filetype": "text/plain",
 | 
			
		||||
@ -99,7 +451,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:04:55",
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55"
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "fake-text.txt",
 | 
			
		||||
      "filetype": "text/plain",
 | 
			
		||||
@ -121,7 +561,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:04:55",
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55"
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:55",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "fake-text.txt",
 | 
			
		||||
      "filetype": "text/plain",
 | 
			
		||||
 | 
			
		||||
@ -11,7 +11,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:04:47",
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:47"
 | 
			
		||||
        "date_modified": "2023-06-16T05:04:47",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "ideas-page.html",
 | 
			
		||||
      "filetype": "text/html",
 | 
			
		||||
 | 
			
		||||
@ -11,7 +11,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:05:05",
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05"
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "stanley-cups.xlsx",
 | 
			
		||||
      "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 | 
			
		||||
@ -36,7 +124,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:05:05",
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05"
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "stanley-cups.xlsx",
 | 
			
		||||
      "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 | 
			
		||||
@ -61,7 +237,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:05:05",
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05"
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "stanley-cups.xlsx",
 | 
			
		||||
      "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 | 
			
		||||
@ -86,7 +350,95 @@
 | 
			
		||||
          "site_url": "https://unstructuredio.sharepoint.com"
 | 
			
		||||
        },
 | 
			
		||||
        "date_created": "2023-06-16T05:05:05",
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05"
 | 
			
		||||
        "date_modified": "2023-06-16T05:05:05",
 | 
			
		||||
        "permissions_data": [
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE93bmVycw",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Owners",
 | 
			
		||||
                "id": "3",
 | 
			
		||||
                "loginName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Owners"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "read"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIFZpc2l0b3Jz",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Visitors",
 | 
			
		||||
                "id": "4",
 | 
			
		||||
                "loginName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Visitors"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "write"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "Q29tbXVuaWNhdGlvbiBzaXRlIE1lbWJlcnM",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "siteGroup": {
 | 
			
		||||
                "displayName": "Communication site Members",
 | 
			
		||||
                "id": "5",
 | 
			
		||||
                "loginName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Communication site Members"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          },
 | 
			
		||||
          {
 | 
			
		||||
            "id": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "roles": [
 | 
			
		||||
              "owner"
 | 
			
		||||
            ],
 | 
			
		||||
            "shareId": "YzowdC5jfHRlbmFudHxmYzA5NTU5Mi1jMjVhLTRhYzctYjJhMS0zMWUwZDAxZTRiNTE",
 | 
			
		||||
            "grantedToV2": {
 | 
			
		||||
              "group": {
 | 
			
		||||
                "@odata.type": "#microsoft.graph.sharePointIdentity",
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              },
 | 
			
		||||
              "siteUser": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "7",
 | 
			
		||||
                "loginName": "c:0t.c|tenant|fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "grantedTo": {
 | 
			
		||||
              "user": {
 | 
			
		||||
                "displayName": "Global Administrator",
 | 
			
		||||
                "id": "fc095592-c25a-4ac7-b2a1-31e0d01e4b51"
 | 
			
		||||
              }
 | 
			
		||||
            },
 | 
			
		||||
            "inheritedFrom": {}
 | 
			
		||||
          }
 | 
			
		||||
        ]
 | 
			
		||||
      },
 | 
			
		||||
      "filename": "stanley-cups.xlsx",
 | 
			
		||||
      "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 | 
			
		||||
 | 
			
		||||
@ -19,6 +19,11 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then
 | 
			
		||||
   exit 0
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
 | 
			
		||||
   echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
 | 
			
		||||
   exit 0
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [ -z "$OPENAI_API_KEY" ]; then
 | 
			
		||||
   echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set."
 | 
			
		||||
   exit 0
 | 
			
		||||
@ -85,6 +90,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 | 
			
		||||
    --client-cred "$SHAREPOINT_CRED" \
 | 
			
		||||
    --client-id "$SHAREPOINT_CLIENT_ID" \
 | 
			
		||||
    --site "$SHAREPOINT_SITE" \
 | 
			
		||||
    --permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \
 | 
			
		||||
    --permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \
 | 
			
		||||
    --permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \
 | 
			
		||||
    --path "Shared Documents" \
 | 
			
		||||
    --recursive \
 | 
			
		||||
    --embedding-api-key "$OPENAI_API_KEY" \
 | 
			
		||||
 | 
			
		||||
@ -26,6 +26,12 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
 | 
			
		||||
   echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
 | 
			
		||||
   exit 0
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
 | 
			
		||||
   echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
 | 
			
		||||
   exit 0
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly
 | 
			
		||||
PYTHONPATH=. ./unstructured/ingest/main.py \
 | 
			
		||||
    sharepoint \
 | 
			
		||||
@ -40,6 +46,9 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 | 
			
		||||
    --client-cred "$SHAREPOINT_CRED" \
 | 
			
		||||
    --client-id "$SHAREPOINT_CLIENT_ID" \
 | 
			
		||||
    --site "$SHAREPOINT_SITE" \
 | 
			
		||||
    --permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \
 | 
			
		||||
    --permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \
 | 
			
		||||
    --permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \
 | 
			
		||||
    --path "Shared Documents" \
 | 
			
		||||
    --recursive \
 | 
			
		||||
    --work-dir "$WORK_DIR"
 | 
			
		||||
 | 
			
		||||
@ -45,6 +45,7 @@ class DataSourceMetadata:
 | 
			
		||||
    date_created: Optional[str] = None
 | 
			
		||||
    date_modified: Optional[str] = None
 | 
			
		||||
    date_processed: Optional[str] = None
 | 
			
		||||
    permissions_data: Optional[List[Dict[str, Any]]] = None
 | 
			
		||||
 | 
			
		||||
    def to_dict(self):
 | 
			
		||||
        return {key: value for key, value in self.__dict__.items() if value is not None}
 | 
			
		||||
 | 
			
		||||
@ -11,6 +11,7 @@ from unstructured.ingest.interfaces import (
 | 
			
		||||
    ChunkingConfig,
 | 
			
		||||
    EmbeddingConfig,
 | 
			
		||||
    PartitionConfig,
 | 
			
		||||
    PermissionsConfig,
 | 
			
		||||
    ProcessorConfig,
 | 
			
		||||
    ReadConfig,
 | 
			
		||||
)
 | 
			
		||||
@ -287,12 +288,12 @@ class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
 | 
			
		||||
    ):
 | 
			
		||||
        """
 | 
			
		||||
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
 | 
			
		||||
        This allows CLI arguments to be prepended with chunk_ during CLI invocation but
 | 
			
		||||
        This allows CLI arguments to be prepended with embedding_ during CLI invocation but
 | 
			
		||||
        doesn't require that as part of the field names in this class
 | 
			
		||||
        """
 | 
			
		||||
        if isinstance(kvs, dict):
 | 
			
		||||
            new_kvs = {
 | 
			
		||||
                k[len("embedding-") :]: v  # noqa: E203
 | 
			
		||||
                k[len("embedding_") :]: v  # noqa: E203
 | 
			
		||||
                for k, v in kvs.items()
 | 
			
		||||
                if k.startswith("embedding_")
 | 
			
		||||
            }
 | 
			
		||||
@ -363,3 +364,75 @@ class CliChunkingConfig(ChunkingConfig, CliMixin):
 | 
			
		||||
                return None
 | 
			
		||||
            return _decode_dataclass(cls, new_kvs, infer_missing)
 | 
			
		||||
        return _decode_dataclass(cls, kvs, infer_missing)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CliPermissionsConfig(PermissionsConfig, CliMixin):
    """CLI-facing variant of PermissionsConfig.

    Registers the ``--permissions-*`` click options and builds the config from the
    parsed CLI values after stripping the ``permissions_`` prefix from their keys.
    """

    @staticmethod
    def add_cli_options(cmd: click.Command) -> None:
        """Append the three ``--permissions-*`` options to ``cmd.params``."""
        options = [
            click.Option(
                ["--permissions-application-id"],
                type=str,
                help="Microsoft Graph API application id",
            ),
            click.Option(
                ["--permissions-client-cred"],
                type=str,
                help="Microsoft Graph API application credentials",
            ),
            click.Option(
                ["--permissions-tenant"],
                type=str,
                help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.",
            ),
        ]
        cmd.params.extend(options)

    @classmethod
    def from_dict(
        cls,
        kvs: Json,
        *,
        infer_missing=False,
    ):
        """
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
        This allows CLI arguments to be prepended with permissions_ during CLI invocation but
        doesn't require that as part of the field names in this class. It also checks if the
        CLI params are provided as intended.

        Returns:
            ``None`` when ``kvs`` is a dict without any ``permissions_`` key, otherwise the
            result of ``_decode_dataclass``.

        Raises:
            ValueError: when some, but not all, of the three permissions values are set.
        """
        if not isinstance(kvs, dict):
            return _decode_dataclass(cls, kvs, infer_missing)

        # ``.get`` rather than indexing: a dict that lacks one of the keys is treated as
        # "value not provided" instead of failing with a KeyError.
        provided = [
            bool(kvs.get(key))
            for key in (
                "permissions_application_id",
                "permissions_client_cred",
                "permissions_tenant",
            )
        ]
        if any(provided) and not all(provided):
            raise ValueError(
                "Please provide either none or all of the following optional values:\n"
                "--permissions-application-id\n"
                "--permissions-client-cred\n"
                "--permissions-tenant",
            )

        new_kvs = {
            k[len("permissions_") :]: v  # noqa: E203
            for k, v in kvs.items()
            if k.startswith("permissions_")
        }
        if not new_kvs:
            return None
        return _decode_dataclass(cls, new_kvs, infer_missing)
 | 
			
		||||
 | 
			
		||||
@ -8,6 +8,7 @@ from unstructured.ingest.cli.interfaces import (
 | 
			
		||||
    CliEmbeddingConfig,
 | 
			
		||||
    CliMixin,
 | 
			
		||||
    CliPartitionConfig,
 | 
			
		||||
    CliPermissionsConfig,
 | 
			
		||||
    CliProcessorConfig,
 | 
			
		||||
    CliReadConfig,
 | 
			
		||||
)
 | 
			
		||||
@ -39,6 +40,7 @@ def extract_configs(
 | 
			
		||||
        "embedding_config": CliEmbeddingConfig.from_dict(data),
 | 
			
		||||
        "chunking_config": CliChunkingConfig.from_dict(data),
 | 
			
		||||
        "processor_config": CliProcessorConfig.from_dict(data),
 | 
			
		||||
        "permissions_config": CliPermissionsConfig.from_dict(data),
 | 
			
		||||
    }
 | 
			
		||||
    for v in validate:
 | 
			
		||||
        v.from_dict(data)
 | 
			
		||||
@ -52,6 +54,7 @@ def add_options(cmd: click.Command, extras=t.List[t.Type[CliMixin]]) -> click.Co
 | 
			
		||||
        CliEmbeddingConfig,
 | 
			
		||||
        CliChunkingConfig,
 | 
			
		||||
        CliProcessorConfig,
 | 
			
		||||
        CliPermissionsConfig,
 | 
			
		||||
    ]
 | 
			
		||||
    configs.extend(extras)
 | 
			
		||||
    for config in configs:
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,5 @@
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import typing as t
 | 
			
		||||
from dataclasses import dataclass, field
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
@ -15,6 +17,7 @@ from unstructured.ingest.interfaces import (
 | 
			
		||||
    SourceConnectorCleanupMixin,
 | 
			
		||||
    SourceMetadata,
 | 
			
		||||
)
 | 
			
		||||
from unstructured.ingest.interfaces import PermissionsConfig as SharepointPermissionsConfig
 | 
			
		||||
from unstructured.ingest.logger import logger
 | 
			
		||||
from unstructured.utils import requires_dependencies
 | 
			
		||||
 | 
			
		||||
@ -35,6 +38,7 @@ class SimpleSharepointConfig(BaseConnectorConfig):
 | 
			
		||||
    path: str
 | 
			
		||||
    process_pages: bool = False
 | 
			
		||||
    recursive: bool = False
 | 
			
		||||
    permissions_config: t.Optional[SharepointPermissionsConfig] = None
 | 
			
		||||
 | 
			
		||||
    def __post_init__(self):
 | 
			
		||||
        if not (self.client_id and self.client_credential and self.site_url):
 | 
			
		||||
@ -57,6 +61,14 @@ class SimpleSharepointConfig(BaseConnectorConfig):
 | 
			
		||||
            raise
 | 
			
		||||
        return site_client
 | 
			
		||||
 | 
			
		||||
    def get_permissions_client(self):
        """Create a SharepointPermissionsConnector from ``self.permissions_config``.

        Returns:
            The connector when it holds a non-empty access token, otherwise ``None``.
            Any failure while building it is logged instead of being raised.
        """
        try:
            permissions_connector = SharepointPermissionsConnector(self.permissions_config)
            # Explicit check instead of ``assert`` so it is not stripped under ``python -O``.
            if not permissions_connector.access_token:
                raise ValueError("empty access token")
            return permissions_connector
        except Exception as e:
            # The message needs a placeholder for ``e``; without one the logging call
            # fails to format and the original error is lost.
            logger.error("Couldn't obtain Sharepoint permissions ingestion access token: %s", e)
            return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
 | 
			
		||||
@ -122,7 +134,6 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
 | 
			
		||||
                file = site_client.web.get_file_by_server_relative_url(self.server_path)
 | 
			
		||||
                if properties_only:
 | 
			
		||||
                    file = file.get().execute_query()
 | 
			
		||||
 | 
			
		||||
        except ClientRequestException as e:
 | 
			
		||||
            if e.response.status_code == 404:
 | 
			
		||||
                return None
 | 
			
		||||
@ -144,6 +155,44 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
 | 
			
		||||
            return None
 | 
			
		||||
        return page
 | 
			
		||||
 | 
			
		||||
    def update_permissions_data(self):
        """Load the permissions JSON stored on disk for this document, if any.

        Files are looked up in ``<output_dir>/permissions_data/sites`` when the first
        component of ``self.file_path`` is ``sites`` and in ``.../other`` otherwise. They
        are named ``<parent>_SEP_<document name>.json``.

        Returns:
            The parsed JSON content of the matching file, or ``None`` when the directory
            does not exist or no file matches. If several files match, the last one
            returned by ``os.listdir`` wins.
        """
        separator = "_SEP_"
        path_parts = self.file_path.split("/")
        parent_type = path_parts[0]
        ingestdoc_docname = path_parts[-1]

        permissions_dir = Path(self.processor_config.output_dir) / "permissions_data"
        read_dir = permissions_dir / ("sites" if parent_type == "sites" else "other")
        if not read_dir.is_dir():
            # Nothing was written for this kind of parent (or at all).
            return None

        def parent_name_matches(parent_name):
            # Documents under "sites/<name>/..." must also match on the site name.
            if parent_type == "sites":
                return len(path_parts) > 1 and parent_name == path_parts[1]
            return parent_type in ("SitePages", "Shared Documents")

        permissions_data = None
        for filename in os.listdir(read_dir):
            stem = os.path.splitext(filename)[0]
            # Split on the first separator only, so a document name that itself
            # contains "_SEP_" is kept whole.
            parent_name, found, permissions_docname = stem.partition(separator)
            if not found:
                # Not a permissions file written in the expected naming scheme.
                continue
            if permissions_docname == ingestdoc_docname and parent_name_matches(parent_name):
                with open(read_dir / filename) as f:
                    permissions_data = json.loads(f.read())

        return permissions_data
 | 
			
		||||
 | 
			
		||||
    def update_source_metadata(self, **kwargs):
 | 
			
		||||
        if self.is_page:
 | 
			
		||||
            page = self._fetch_page()
 | 
			
		||||
@ -158,6 +207,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
 | 
			
		||||
                version=page.get_property("Version", ""),
 | 
			
		||||
                source_url=page.absolute_url,
 | 
			
		||||
                exists=True,
 | 
			
		||||
                permissions_data=self.update_permissions_data()
 | 
			
		||||
                if self.connector_config.permissions_config
 | 
			
		||||
                else None,
 | 
			
		||||
            )
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
@ -176,6 +228,9 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
 | 
			
		||||
            version=file.major_version,
 | 
			
		||||
            source_url=file.properties.get("LinkingUrl", None),
 | 
			
		||||
            exists=True,
 | 
			
		||||
            permissions_data=self.update_permissions_data()
 | 
			
		||||
            if self.connector_config.permissions_config
 | 
			
		||||
            else None,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _download_page(self):
 | 
			
		||||
@ -317,6 +372,12 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
 | 
			
		||||
 | 
			
		||||
    def get_ingest_docs(self):
 | 
			
		||||
        base_site_client = self.connector_config.get_site_client()
 | 
			
		||||
 | 
			
		||||
        if self.connector_config.permissions_config:
 | 
			
		||||
            permissions_client = self.connector_config.get_permissions_client()
 | 
			
		||||
            if permissions_client:
 | 
			
		||||
                permissions_client.write_all_permissions(self.processor_config.output_dir)
 | 
			
		||||
 | 
			
		||||
        if not base_site_client.is_tenant:
 | 
			
		||||
            return self._ingest_site_docs(base_site_client)
 | 
			
		||||
        tenant = base_site_client.tenant
 | 
			
		||||
@ -328,3 +389,166 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
 | 
			
		||||
            site_client = self.connector_config.get_site_client(site_url)
 | 
			
		||||
            ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
 | 
			
		||||
        return ingest_docs
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class SharepointPermissionsConnector:
    """Reads file permissions from the Microsoft Graph API and writes them to disk.

    An access token is requested when the object is created. ``write_all_permissions``
    then walks sites -> drives -> root-level drive items and stores the permissions of
    each item as a JSON file.
    """

    def __init__(self, permissions_config):
        self.permissions_config: SharepointPermissionsConfig = permissions_config
        self.initialize()

    def initialize(self):
        """Request the access token and keep it on the instance."""
        self.access_token: str = self.get_access_token()

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_access_token(self) -> str:
        """Request a client-credentials token for the Graph API.

        Raises:
            ValueError: when the token endpoint response carries no ``access_token``.
        """
        import requests

        url = (
            f"https://login.microsoftonline.com/{self.permissions_config.tenant}/oauth2/v2.0/token"
        )
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        data = {
            "client_id": self.permissions_config.application_id,
            "scope": "https://graph.microsoft.com/.default",
            "client_secret": self.permissions_config.client_cred,
            "grant_type": "client_credentials",
        }
        response = requests.post(url, headers=headers, data=data)
        payload = response.json()
        if "access_token" not in payload:
            # Report the failure with context instead of a bare KeyError.
            raise ValueError(
                f"Token request failed with status code {response.status_code}: "
                f"{payload.get('error_description', payload.get('error'))}",
            )
        return payload["access_token"]

    def validated_response(self, response):
        """Return the decoded JSON body for a 200 response, otherwise log and return None."""
        if response.status_code == 200:
            return response.json()
        logger.error(
            "Request failed with status code %s:\n%s",
            response.status_code,
            response.text,
        )
        return None

    @requires_dependencies(["requests"], extras="sharepoint")
    def _graph_get(self, url, params=None):
        """Send an authorized GET request and return the validated JSON body (or None)."""
        import requests

        headers = {
            "Authorization": f"Bearer {self.access_token}",
        }
        response = requests.get(url, params=params, headers=headers)
        return self.validated_response(response)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_sites(self):
        """List sites, selecting only their ``webUrl`` and ``id``."""
        url = "https://graph.microsoft.com/v1.0/sites"
        params = {
            "$select": "webUrl, id",
        }
        return self._graph_get(url, params=params)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drives(self, site):
        """List the drives of the site with id ``site``."""
        url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives"
        return self._graph_get(url)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drive_items(self, site, drive_id):
        """List the children of the root folder of a drive."""
        url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives/{drive_id}/root/children"
        return self._graph_get(url)

    def extract_site_name_from_weburl(self, weburl):
        """Derive ``(parent_type, parent_name)`` from the path of an item's web URL.

        Returns ``(None, None)`` and logs a warning when the first path component is
        not one of the recognized kinds.
        """
        split_path = urlparse(weburl).path.lstrip("/").split("/")

        # The length check keeps a URL ending at "/sites" from raising an IndexError.
        if split_path[0] == "sites" and len(split_path) > 1:
            return "sites", split_path[1]

        elif split_path[0] == "Shared%20Documents":
            return "Shared Documents", "Shared Documents"

        elif split_path[0] == "personal":
            return "Personal", "Personal"

        elif split_path[0] == "_layouts":
            return "layouts", "layouts"

        # if other weburl structures are found, additional logic might need to be implemented

        logger.warning(
            "Couldn't extract sitename, unknown site or parent type. Skipping permissions "
            "ingestion for the document with the URL: %s",
            weburl,
        )

        return None, None

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_permissions_for_drive_item(self, site, drive_id, item_id):
        """Fetch the permissions of a single drive item."""
        # Built from adjacent literals: a backslash continuation inside the string
        # would embed the next line's indentation in the URL.
        url = (
            f"https://graph.microsoft.com/v1.0/sites/{site}"
            f"/drives/{drive_id}/items/{item_id}/permissions"
        )
        return self._graph_get(url)

    def write_all_permissions(self, output_dir):
        """Write the permissions of every discovered drive item below ``output_dir``.

        Files go to ``<output_dir>/permissions_data/sites`` for items whose parent type
        is ``sites`` and to ``.../other`` for everything else, and are named
        ``<parent_name>_SEP_<item_name>.json``.
        """
        sites = self.get_sites()
        if not sites:
            # The failed request was already logged by validated_response.
            return

        logger.info("Obtaining drive data for sites for permissions (rbac)")
        drive_ids = []
        for site_id in [site["id"] for site in sites["value"]]:
            drives = self.get_drives(site_id)
            if drives:
                drive_ids.extend([(site_id, drive["id"]) for drive in drives["value"]])

        logger.info("Obtaining item data from drives for permissions (rbac)")
        item_ids = []
        for site, drive_id in drive_ids:
            drive_items = self.get_drive_items(site, drive_id)
            if drive_items:
                item_ids.extend(
                    [
                        (site, drive_id, item["id"], item["name"], item["webUrl"])
                        for item in drive_items["value"]
                    ],
                )

        permissions_dir = Path(output_dir) / "permissions_data"

        logger.info("Writing permissions data to disk")
        for site, drive_id, item_id, item_name, item_web_url in item_ids:
            res = self.get_permissions_for_drive_item(site, drive_id, item_id)
            if not res:
                continue

            parent_type, parent_name = self.extract_site_name_from_weburl(item_web_url)
            # NOTE(review): for an unrecognized URL parent_name is None and the data is
            # still written as "other/None_SEP_<item>.json", although the warning says the
            # document is skipped -- confirm which behavior is intended.
            subdir = "sites" if parent_type == "sites" else "other"
            write_path = permissions_dir / subdir / f"{parent_name}_SEP_{item_name}.json"
            write_path.parent.mkdir(parents=True, exist_ok=True)

            with open(write_path, "w") as f:
                json.dump(res["value"], f)
 | 
			
		||||
 | 
			
		||||
@ -107,6 +107,14 @@ class ChunkingConfig(BaseConfig):
 | 
			
		||||
            return elements
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class PermissionsConfig(BaseConfig):
    """Credentials used to obtain permissions data for ingested documents."""

    # Application (client) id used when requesting the access token.
    application_id: t.Optional[str]
    # Client secret belonging to the application id.
    client_cred: t.Optional[str]
    # Tenant the token is requested for.
    tenant: t.Optional[str]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class WriteConfig(BaseConfig):
 | 
			
		||||
    pass
 | 
			
		||||
@ -123,6 +131,7 @@ class SourceMetadata(DataClassJsonMixin, ABC):
 | 
			
		||||
    version: t.Optional[str] = None
 | 
			
		||||
    source_url: t.Optional[str] = None
 | 
			
		||||
    exists: t.Optional[bool] = None
 | 
			
		||||
    permissions_data: t.Optional[t.List[t.Dict[str, t.Any]]] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
@ -212,6 +221,13 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
 | 
			
		||||
        the version of the document."""
 | 
			
		||||
        return self.source_metadata.version  # type: ignore
 | 
			
		||||
 | 
			
		||||
    @property
    def permissions_data(self) -> t.Optional[t.List[t.Dict[str, t.Any]]]:
        """Permissions (access control / sharing) data reported by the source system.

        The source metadata is populated first when it is still missing.
        """
        metadata = self.source_metadata
        if metadata is None:
            self.update_source_metadata()
            metadata = self.source_metadata
        return metadata.permissions_data  # type: ignore
 | 
			
		||||
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def cleanup_file(self):
 | 
			
		||||
        """Removes the local copy the file (or anything else) after successful processing."""
 | 
			
		||||
@ -240,6 +256,12 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
 | 
			
		||||
        """Sets the SourceMetadata and the  properties for the doc"""
 | 
			
		||||
        self._source_metadata = SourceMetadata()
 | 
			
		||||
 | 
			
		||||
    def update_permissions_data(self):
        """Default implementation: record that the doc has no permissions data.

        Only resets ``self._permissions_data`` to ``None``.
        NOTE(review): presumably connector subclasses override this to provide real
        data for ``SourceMetadata.permissions_data`` -- confirm against the callers.
        """
        self._permissions_data: t.Optional[t.List[t.Dict]] = None
 | 
			
		||||
 | 
			
		||||
    # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods
 | 
			
		||||
    # in addition to or instead of get_file()
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
@ -269,6 +291,7 @@ class BaseIngestDoc(IngestDocJsonMixin, ABC):
 | 
			
		||||
                    date_created=self.date_created,
 | 
			
		||||
                    date_modified=self.date_modified,
 | 
			
		||||
                    date_processed=self.date_processed,
 | 
			
		||||
                    permissions_data=self.permissions_data,
 | 
			
		||||
                ),
 | 
			
		||||
                **partition_kwargs,
 | 
			
		||||
            )
 | 
			
		||||
@ -420,6 +443,38 @@ class SourceConnectorCleanupMixin:
 | 
			
		||||
            os.rmdir(cur_dir)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PermissionsCleanupMixin:
    """Mixin that removes the ``permissions_data`` directory kept under the output dir."""

    processor_config: ProcessorConfig

    def cleanup_permissions(self, cur_dir=None):
        """Recursively clean up the permissions files and directories.

        Args:
            cur_dir: file or directory to remove. Defaults to
                ``<processor_config.output_dir>/permissions_data``.

        Files are deleted and directories are removed once they are empty. Symlinks
        are neither followed nor removed. A path that does not exist is ignored.
        """
        if cur_dir is None:
            cur_dir = Path(self.processor_config.output_dir, "permissions_data")
        cur_dir = Path(cur_dir)

        if cur_dir.is_file():
            os.remove(cur_dir)
            return
        if not cur_dir.is_dir():
            # Nothing was written, so there is nothing to clean up.
            return

        # Children are addressed through their full path. Changing the working
        # directory and returning with ".." does not restore the original directory
        # when ``cur_dir`` is a relative path with more than one component.
        for entry in os.listdir(cur_dir):
            child = cur_dir / entry
            # don't traverse symlinks, not that there ever should be any
            if not os.path.islink(child):
                self.cleanup_permissions(child)

        # Only an empty directory can be removed; anything left behind (e.g. a
        # symlink) keeps it in place.
        if not os.listdir(cur_dir):
            os.rmdir(cur_dir)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class IngestDocCleanupMixin:
 | 
			
		||||
    read_config: ReadConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,7 @@
 | 
			
		||||
from .doc_factory import DocFactory
 | 
			
		||||
from .interfaces import PipelineContext, ReformatNode
 | 
			
		||||
from .partition import Partitioner
 | 
			
		||||
from .permissions import PermissionsDataCleaner
 | 
			
		||||
from .pipeline import Pipeline
 | 
			
		||||
from .reformat.chunking import Chunker
 | 
			
		||||
from .reformat.embedding import Embedder
 | 
			
		||||
@ -17,4 +18,5 @@ __all__ = [
 | 
			
		||||
    "Writer",
 | 
			
		||||
    "Chunker",
 | 
			
		||||
    "ReformatNode",
 | 
			
		||||
    "PermissionsDataCleaner",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
@ -212,3 +212,18 @@ class CopyNode(PipelineNode):
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def run(self, json_path: str):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class PermissionsNode(PipelineNode):
    """Pipeline node that wraps operations on permissions related data.

    Concrete subclasses provide the operation by implementing ``run``.
    """

    def initialize(self):
        """Log that the node is starting, then run the parent class initialization."""
        logger.info("Running permissions node to cleanup the permissions folder")
        super().initialize()

    @abstractmethod
    def run(self):
        """Perform the permissions operation; must be implemented by subclasses."""
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										12
									
								
								unstructured/ingest/pipeline/permissions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								unstructured/ingest/pipeline/permissions.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,12 @@
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
 | 
			
		||||
from unstructured.ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig
 | 
			
		||||
from unstructured.ingest.pipeline.interfaces import PermissionsNode
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin):
    """Permissions node whose only job is to run the permissions cleanup."""

    # Read by PermissionsCleanupMixin.cleanup_permissions to locate the output dir.
    processor_config: ProcessorConfig

    def run(self):
        """Delegate to ``cleanup_permissions`` with its default directory."""
        self.cleanup_permissions()
 | 
			
		||||
@ -15,6 +15,7 @@ from unstructured.ingest.pipeline.interfaces import (
 | 
			
		||||
    SourceNode,
 | 
			
		||||
    WriteNode,
 | 
			
		||||
)
 | 
			
		||||
from unstructured.ingest.pipeline.permissions import PermissionsDataCleaner
 | 
			
		||||
from unstructured.ingest.pipeline.utils import get_ingest_doc_hash
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -26,6 +27,7 @@ class Pipeline(DataClassJsonMixin):
 | 
			
		||||
    partition_node: PartitionNode
 | 
			
		||||
    write_node: t.Optional[WriteNode] = None
 | 
			
		||||
    reformat_nodes: t.List[ReformatNode] = field(default_factory=list)
 | 
			
		||||
    permissions_node: t.Optional[PermissionsDataCleaner] = None
 | 
			
		||||
 | 
			
		||||
    def initialize(self):
 | 
			
		||||
        ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
 | 
			
		||||
@ -79,3 +81,6 @@ class Pipeline(DataClassJsonMixin):
 | 
			
		||||
 | 
			
		||||
        if self.write_node:
 | 
			
		||||
            self.write_node(iterable=partitioned_jsons)
 | 
			
		||||
 | 
			
		||||
        if self.permissions_node:
 | 
			
		||||
            self.permissions_node.cleanup_permissions()
 | 
			
		||||
 | 
			
		||||
@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import (
 | 
			
		||||
    ChunkingConfig,
 | 
			
		||||
    EmbeddingConfig,
 | 
			
		||||
    PartitionConfig,
 | 
			
		||||
    PermissionsConfig,
 | 
			
		||||
    ProcessorConfig,
 | 
			
		||||
)
 | 
			
		||||
from unstructured.ingest.pipeline import (
 | 
			
		||||
@ -15,6 +16,7 @@ from unstructured.ingest.pipeline import (
 | 
			
		||||
    DocFactory,
 | 
			
		||||
    Embedder,
 | 
			
		||||
    Partitioner,
 | 
			
		||||
    PermissionsDataCleaner,
 | 
			
		||||
    Pipeline,
 | 
			
		||||
    PipelineContext,
 | 
			
		||||
    Reader,
 | 
			
		||||
@ -33,6 +35,7 @@ def process_documents(
 | 
			
		||||
    dest_doc_connector: t.Optional[BaseDestinationConnector] = None,
 | 
			
		||||
    chunking_config: t.Optional[ChunkingConfig] = None,
 | 
			
		||||
    embedder_config: t.Optional[EmbeddingConfig] = None,
 | 
			
		||||
    permissions_config: t.Optional[PermissionsConfig] = None,
 | 
			
		||||
) -> None:
 | 
			
		||||
    pipeline_config = PipelineContext.from_dict(processor_config.to_dict())
 | 
			
		||||
    doc_factory = DocFactory(
 | 
			
		||||
@ -64,6 +67,11 @@ def process_documents(
 | 
			
		||||
        if dest_doc_connector
 | 
			
		||||
        else None
 | 
			
		||||
    )
 | 
			
		||||
    permissions_data_cleaner = (
 | 
			
		||||
        PermissionsDataCleaner(pipeline_context=pipeline_config, processor_config=processor_config)
 | 
			
		||||
        if permissions_config
 | 
			
		||||
        else None
 | 
			
		||||
    )
 | 
			
		||||
    pipeline = Pipeline(
 | 
			
		||||
        pipeline_context=pipeline_config,
 | 
			
		||||
        doc_factory_node=doc_factory,
 | 
			
		||||
@ -71,5 +79,6 @@ def process_documents(
 | 
			
		||||
        partition_node=partitioner,
 | 
			
		||||
        reformat_nodes=reformat_nodes,
 | 
			
		||||
        write_node=writer,
 | 
			
		||||
        permissions_node=permissions_data_cleaner,
 | 
			
		||||
    )
 | 
			
		||||
    pipeline.run()
 | 
			
		||||
 | 
			
		||||
@ -8,6 +8,7 @@ from unstructured.ingest.interfaces import (
 | 
			
		||||
    ChunkingConfig,
 | 
			
		||||
    EmbeddingConfig,
 | 
			
		||||
    PartitionConfig,
 | 
			
		||||
    PermissionsConfig,
 | 
			
		||||
    ProcessorConfig,
 | 
			
		||||
    ReadConfig,
 | 
			
		||||
)
 | 
			
		||||
@ -24,6 +25,7 @@ class Runner(ABC):
 | 
			
		||||
    writer_kwargs: t.Optional[dict] = None
 | 
			
		||||
    embedding_config: t.Optional[EmbeddingConfig] = None
 | 
			
		||||
    chunking_config: t.Optional[ChunkingConfig] = None
 | 
			
		||||
    permissions_config: t.Optional[PermissionsConfig] = None
 | 
			
		||||
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def run(self, *args, **kwargs):
 | 
			
		||||
@ -36,6 +38,18 @@ class Runner(ABC):
 | 
			
		||||
            return writer(**writer_kwargs)
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
    def get_permissions_config(self) -> t.Optional[PermissionsConfig]:
        """Return the permissions config only when it is fully populated.

        Returns:
            ``self.permissions_config`` when it is set and its
            ``application_id``, ``client_cred`` and ``tenant`` values are all
            truthy; otherwise ``None``.
        """
        config = self.permissions_config
        if config is None:
            return None

        # Check the fields lazily, in order, and stop at the first empty one.
        for required_field in ("application_id", "client_cred", "tenant"):
            if not getattr(config, required_field):
                return None

        return config
 | 
			
		||||
 | 
			
		||||
    def process_documents(self, source_doc_connector: BaseSourceConnector):
 | 
			
		||||
        process_documents(
 | 
			
		||||
            processor_config=self.processor_config,
 | 
			
		||||
@ -44,4 +58,5 @@ class Runner(ABC):
 | 
			
		||||
            dest_doc_connector=self.get_dest_doc_connector(),
 | 
			
		||||
            embedder_config=self.embedding_config,
 | 
			
		||||
            chunking_config=self.chunking_config,
 | 
			
		||||
            permissions_config=self.get_permissions_config(),
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,6 @@
 | 
			
		||||
import hashlib
 | 
			
		||||
import logging
 | 
			
		||||
import typing as t
 | 
			
		||||
 | 
			
		||||
from unstructured.ingest.logger import ingest_log_streaming_init, logger
 | 
			
		||||
from unstructured.ingest.runner.base_runner import Runner
 | 
			
		||||
@ -12,6 +13,9 @@ class SharePointRunner(Runner):
 | 
			
		||||
        site: str,
 | 
			
		||||
        client_id: str,
 | 
			
		||||
        client_cred: str,
 | 
			
		||||
        permissions_application_id: t.Optional[str],
 | 
			
		||||
        permissions_client_cred: t.Optional[str],
 | 
			
		||||
        permissions_tenant: t.Optional[str],
 | 
			
		||||
        path: str,
 | 
			
		||||
        files_only: bool = False,
 | 
			
		||||
        recursive: bool = False,
 | 
			
		||||
@ -31,10 +35,17 @@ class SharePointRunner(Runner):
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        from unstructured.ingest.connector.sharepoint import (
 | 
			
		||||
            SharepointPermissionsConfig,
 | 
			
		||||
            SharepointSourceConnector,
 | 
			
		||||
            SimpleSharepointConfig,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        permissions_config = SharepointPermissionsConfig(
 | 
			
		||||
            application_id=permissions_application_id,
 | 
			
		||||
            client_cred=permissions_client_cred,
 | 
			
		||||
            tenant=permissions_tenant,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        source_doc_connector = SharepointSourceConnector(  # type: ignore
 | 
			
		||||
            processor_config=self.processor_config,
 | 
			
		||||
            connector_config=SimpleSharepointConfig(
 | 
			
		||||
@ -44,6 +55,7 @@ class SharePointRunner(Runner):
 | 
			
		||||
                path=path,
 | 
			
		||||
                process_pages=(not files_only),
 | 
			
		||||
                recursive=recursive,
 | 
			
		||||
                permissions_config=permissions_config,
 | 
			
		||||
            ),
 | 
			
		||||
            read_config=self.read_config,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user