feat: Sharepoint connector (#918)

rvztz 2023-08-10 10:37:58 -06:00 committed by GitHub
parent ef5091f276
commit dee9b405cd
25 changed files with 913 additions and 8 deletions


@@ -200,6 +200,9 @@ jobs:
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |


@@ -72,6 +72,9 @@ jobs:
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
OVERWRITE_FIXTURES: "true"


@@ -1,3 +1,14 @@
## 0.9.2-dev3

### Enhancements

### Features

* Adds SharePoint connector.

### Fixes

## 0.9.2-dev2

examples/ingest/onedrive/ingest.sh Executable file → Normal file

@@ -26,6 +26,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--authority-url "<Authority URL, default is https://login.microsoftonline.com>" \
--tenant "<Azure AD tenant_id, default is 'common'>" \
--user-pname "<Azure AD principal name, in most cases the email linked to the drive>" \
--path "<Path to start parsing files from>" \
--structured-output-dir onedrive-ingest-output \
--num-processes 2 \
--verbose


@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Processes documents from a Microsoft SharePoint site
# through Unstructured's library in 2 processes.
# Structured outputs are stored in sharepoint-ingest-output/
# NOTE, this script is not ready-to-run!
# You must enter a Microsoft SharePoint app client ID, client secret, and
# SharePoint site URL before running; a short Python sketch for sanity-checking
# those credentials follows this script.
# Pass --files-only (a flag, no value) to process only files and skip site pages.
# To get the credentials for your SharePoint app, follow these steps:
# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
sharepoint \
--client-id "<Microsoft Sharepoint app client-id>" \
--client-cred "<Microsoft Sharepoint app client-secret>" \
--site "<e.g. https://contoso.sharepoint.com, or https://contoso-admin.sharepoint.com to process all sites within the tenant>" \
--files-only \
--structured-output-dir sharepoint-ingest-output \
--num-processes 2 \
--verbose
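
A quick way to sanity-check the credentials outside the CLI is a minimal Python sketch built from the same Office365-REST-Python-Client calls the connector uses; the site URL and credentials below are placeholders, not working values.

from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext

# Connect with the app principal credentials (placeholders).
ctx = ClientContext("https://contoso.sharepoint.com").with_credentials(
    ClientCredential("<client-id>", "<client-secret>"),
)
# List the files at the root of the default document library.
folder = ctx.web.get_folder_by_server_relative_path("Shared Documents")
objects = folder.expand(["Files", "Folders"]).get().execute_query()
for f in objects.files:
    print(f.serverRelativeUrl)  # server-relative path of each file found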


@@ -0,0 +1,6 @@
-c constraints.in
-c base.txt
msal==1.23.0
Office365-REST-Python-Client==2.4.2
pyjwt==2.8.0
cryptography==41.0.2


@@ -0,0 +1,50 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-sharepoint.in
#
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -r requirements/ingest-sharepoint.in
# msal
# pyjwt
idna==3.4
# via
# -c requirements/base.txt
# requests
msal==1.23.0
# via
# -r requirements/ingest-sharepoint.in
# office365-rest-python-client
office365-rest-python-client==2.4.2
# via -r requirements/ingest-sharepoint.in
pycparser==2.21
# via cffi
pyjwt[crypto]==2.8.0
# via
# -r requirements/ingest-sharepoint.in
# msal
pytz==2023.3
# via office365-rest-python-client
requests==2.31.0
# via
# -c requirements/base.txt
# msal
# office365-rest-python-client
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests


@@ -0,0 +1,110 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]


@@ -0,0 +1,37 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "0dfe3d76-00c0-42db-ae1b-8cf22d4b3f10",
"server_relative_url": "/Shared Documents/ideas-page.html"
},
"date_created": "2023-06-16T05:04:47Z",
"date_modified": "2023-06-16T05:04:47Z"
},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,44 @@
[
{
"type": "Table",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "b9956a33-8079-4321-91ea-609def07394d",
"server_relative_url": "/Shared Documents/stanley-cups.xlsx"
},
"date_created": "2023-06-16T05:05:05Z",
"date_modified": "2023-06-16T05:05:05Z"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "b9956a33-8079-4321-91ea-609def07394d",
"server_relative_url": "/Shared Documents/stanley-cups.xlsx"
},
"date_created": "2023-06-16T05:05:05Z",
"date_modified": "2023-06-16T05:05:05Z"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]


@@ -0,0 +1,42 @@
[
{
"type": "Title",
"element_id": "b4e929d8bcfe04189801a8ed61496d17",
"metadata": {
"data_source": {
"version": "1.2",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-06-16T05:12:51Z"
},
"filename": "Home.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Documents"
},
{
"type": "Title",
"element_id": "8d14f6e72de8f18ab1ee5c5330f00653",
"metadata": {
"data_source": {
"version": "1.2",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-06-16T05:12:51Z"
},
"filename": "Home.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Events"
}
]


@@ -0,0 +1,82 @@
[
{
"type": "ListItem",
"element_id": "54bdbe8a7a031cf41a7f99cf3a27b8ff",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "This is a plain text site page for testing purposes"
},
{
"type": "ListItem",
"element_id": "7499f3d6c2534c6017c1c6e08406640f",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "These are bullet points meant for testing"
},
{
"type": "NarrativeText",
"element_id": "3d8a9d73a6fae35d8fd19f8e82578fa5",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam ex tellus, sodales non nulla et, sodales consequat turpis. Etiam vestibulum nisl placerat risus elementum, a sodales purus rhoncus. Sed eget velit pharetra, pretium nisi nec, laoreet ligula. Duis luctus mi in ligula cursus, vel lacinia tortor ultricies. Aenean sit amet sodales odio, a maximus elit. Pellentesque vehicula diam sit amet leo placerat placerat. Integer varius elementum accumsan. Donec posuere elit mauris, eget efficitur nisl viverra vitae."
},
{
"type": "NarrativeText",
"element_id": "27f6715881d63c1795b3c7e17b20090a",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Integer at dictum nisi. Cras venenatis non velit in posuere. Curabitur tristique, eros eget tristique pellentesque, neque metus ullamcorper ligula, nec posuere neque lacus nec felis. Nulla a libero eget eros consectetur hendrerit. Pellentesque interdum, diam eget tristique pretium, quam lorem pulvinar lorem, a eleifend nisl lectus at ex. Praesent pulvinar ex ut consequat condimentum. Sed rutrum, erat a hendrerit blandit, urna mauris posuere est, at porttitor risus diam non leo. Nullam rutrum vehicula dolor, quis venenatis ligula rutrum sit amet. Nam massa justo, fermentum in dui lacinia, tincidunt imperdiet nunc. Nam posuere tortor ac lectus elementum, non mollis urna consequat. In interdum non tellus sed pellentesque."
}
]


@@ -27,7 +27,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--client-id "$MS_CLIENT_ID" \
--tenant "$MS_TENANT_ID" \
--user-pname "$MS_USER_PNAME" \
- --onedrive-folder '/utic-test-ingest-fixtures' \
+ --path '/utic-test-ingest-fixtures' \
--recursive

sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME


@@ -0,0 +1,32 @@
#!/usr/bin/env bash
set -e
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=Sharepoint
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
echo "Skipping SharePoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 0
fi
# Exclude metadata.last_modified: it will always equal the processing date, because the SharePoint connector creates documents on the fly.
PYTHONPATH=. ./unstructured/ingest/main.py \
sharepoint \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified \
--num-processes 2 \
--partition-strategy hi_res \
--preserve-downloads \
--reprocess \
--structured-output-dir "$OUTPUT_DIR" \
--verbose \
--client-cred "$SHAREPOINT_CRED" \
--client-id "$SHAREPOINT_CLIENT_ID" \
--site "$SHAREPOINT_SITE" \
--path "Shared Documents" \
--recursive

sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME


@@ -26,10 +26,11 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-onedrive.sh
./test_unstructured_ingest/test-ingest-outlook.sh
./test_unstructured_ingest/test-ingest-elasticsearch.sh
-./test_unstructured_ingest/test-ingest-confluence-diff.sh
+#./test_unstructured_ingest/test-ingest-confluence-diff.sh
./test_unstructured_ingest/test-ingest-confluence-large.sh
./test_unstructured_ingest/test-ingest-local-single-file.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
./test_unstructured_ingest/test-ingest-sharepoint.sh


@@ -1 +1 @@
-__version__ = "0.9.2-dev2" # pragma: no cover
+__version__ = "0.9.2-dev3" # pragma: no cover


@@ -30,6 +30,7 @@ subcommands = [
cli_cmds.local,
cli_cmds.elasticsearch,
cli_cmds.confluence,
cli_cmds.sharepoint,
]
for subcommand in subcommands:


@@ -16,6 +16,7 @@ from .onedrive import get_cmd as onedrive
from .outlook import get_cmd as outlook
from .reddit import get_cmd as reddit
from .s3 import get_cmd as s3
from .sharepoint import get_cmd as sharepoint
from .slack import get_cmd as slack
from .wikipedia import get_cmd as wikipedia
@@ -38,6 +39,7 @@ __all__ = [
"outlook",
"reddit",
"s3",
"sharepoint",
"slack",
"wikipedia",
]


@@ -32,7 +32,7 @@ from unstructured.ingest.runner import onedrive as onedrive_fn
help="Microsoft App client secret",
)
@click.option(
- "--onedrive-folder",
+ "--path",
default=None,
help="Folder to start parsing files from.",
)


@@ -0,0 +1,72 @@
import logging
import click
from unstructured.ingest.cli.common import (
add_recursive_option,
add_shared_options,
log_options,
map_to_processor_config,
map_to_standard_config,
run_init_checks,
)
from unstructured.ingest.logger import ingest_log_streaming_init, logger
from unstructured.ingest.runner import sharepoint as sharepoint_fn
@click.command()
@click.option(
"--client-id",
default=None,
help="SharePoint app client ID",
)
@click.option(
"--client-cred",
default=None,
help="SharePoint app client secret",
)
@click.option(
"--site",
default=None,
help="SharePoint site URL. Process either the base URL, e.g. https://[tenant].sharepoint.com, \
or a relative site, e.g. https://[tenant].sharepoint.com/sites/<site_name>. \
To process all sites within the tenant, pass the site URL as \
https://[tenant]-admin.sharepoint.com. \
This requires the app to be registered at the tenant level.",
)
@click.option(
"--path",
default="Shared Documents",
help="Path from which to start parsing files. If the connector processes all sites \
within the tenant, this filter is applied to each site's document library. \
Defaults to 'Shared Documents'.",
)
@click.option(
"--files-only",
is_flag=True,
default=False,
help="Process only files; skip site pages.",
)
def sharepoint(**options):
verbose = options.get("verbose", False)
ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
log_options(options)
try:
run_init_checks(**options)
connector_config = map_to_standard_config(options)
processor_config = map_to_processor_config(options)
sharepoint_fn(
connector_config=connector_config,
processor_config=processor_config,
**options,
)
except Exception as e:
logger.error(e, exc_info=True)
raise click.ClickException(str(e)) from e
def get_cmd() -> click.Command:
cmd = sharepoint
add_recursive_option(cmd)
add_shared_options(cmd)
return cmd
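
For a local smoke test of the option wiring, click's test runner can invoke the command directly. A hedged sketch: the values are placeholders, and with fake credentials the run is expected to fail once the connector tries to connect; the point is only how options map to the command.

from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(
    get_cmd(),
    [
        "--client-id", "<client-id>",
        "--client-cred", "<client-secret>",
        "--site", "https://contoso.sharepoint.com",
        "--files-only",  # flag: process files only, skip site pages
    ],
)
print(result.exit_code, result.output)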


@@ -27,7 +27,7 @@ class SimpleOneDriveConfig(BaseConnectorConfig):
user_pname: str
tenant: str = field(repr=False)
authority_url: Optional[str] = field(repr=False)
- folder: Optional[str] = field(default="")
+ path: Optional[str] = field(default="")
recursive: bool = False
def __post_init__(self):
@@ -150,7 +150,7 @@ class OneDriveConnector(ConnectorCleanupMixin, BaseConnector):
def get_ingest_docs(self):
root = self.client.users[self.config.user_pname].drive.get().execute_query().root
- if fpath := self.config.folder:
+ if fpath := self.config.path:
root = root.get_by_path(fpath).get().execute_query()
if root is None or not root.is_folder:
raise ValueError(f"Unable to find directory, given: {fpath}")


@@ -0,0 +1,328 @@
from dataclasses import dataclass, field
from html import unescape
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from urllib.parse import urlparse
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
BaseIngestDoc,
ConnectorCleanupMixin,
IngestDocCleanupMixin,
StandardConnectorConfig,
)
from unstructured.ingest.logger import logger
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from office365.sharepoint.files.file import File
MAX_MB_SIZE = 512_000_000  # 512 MB in bytes; larger files are downloaded in chunks
@dataclass
class SimpleSharepointConfig(BaseConnectorConfig):
client_id: str
client_credential: str = field(repr=False)
site_url: str
path: str
process_pages: bool = False
recursive: bool = False
def __post_init__(self):
if not (self.client_id and self.client_credential and self.site_url):
raise ValueError(
"Please provide all of the following mandatory values:"
"\n--client-id\n--client-cred\n--site",
)
@dataclass
class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
config: SimpleSharepointConfig
file: "File"
meta: dict
def __post_init__(self):
# Site pages are serialized as HTML; map the .aspx extension to .html as well.
self.ext = "".join(Path(self.file.name).suffixes) if not self.meta else ".html"
self.ext = self.ext if self.ext != ".aspx" else ".html"
if not self.ext:
raise ValueError("Unsupported file without extension.")
if self.ext not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension {self.ext} not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
)
self._set_download_paths()
def _set_download_paths(self) -> None:
"""Parses the folder structure from the source and creates the download and output paths"""
download_path = Path(f"{self.standard_config.download_dir}")
output_path = Path(f"{self.standard_config.output_dir}")
if self.meta:
page_url = self.meta["page"].get_property("Url", "")
parent = (
Path(page_url).with_suffix(self.ext)
if (self.meta["site_path"] is None)
else Path(self.meta["site_path"] + "/" + page_url).with_suffix(self.ext)
)
else:
parent = Path(self.file.serverRelativeUrl[1:])
self.download_dir = (download_path / parent.parent).resolve()
self.download_filepath = (download_path / parent).resolve()
oname = f"{str(parent)[:-len(self.ext)]}.json"
self.output_dir = (output_path / parent.parent).resolve()
self.output_filepath = (output_path / oname).resolve()
@property
def filename(self):
return Path(self.download_filepath).resolve()
@property
def _output_filename(self):
return Path(self.output_filepath).resolve()
@property
def date_created(self) -> Optional[str]:
if self.meta:
return self.meta["page"].properties.get("FirstPublished", None)
return self.file.time_created
@property
def date_modified(self) -> Optional[str]:
if self.meta:
return self.meta["page"].properties.get("Modified", None)
return self.file.time_last_modified
@property
def exists(self) -> Optional[bool]:
if self.meta:
return self.meta["page"].properties.get("FileName", None) and self.meta[
"page"
].properties.get("UniqueId", None)
return self.file.exists
@property
def record_locator(self) -> Optional[Dict[str, Any]]:
if self.meta:
record_source = self.meta["page"]
property_name = "AbsoluteUrl"
resource_url_name = "absolute_url"
else:
record_source = self.file
property_name = "ServerRelativeUrl"
resource_url_name = "server_relative_url"
return {
"site": self.config.site_url,
"unique_id": record_source.get_property("UniqueId", ""),
resource_url_name: record_source.get_property(property_name, ""),
}
@property
def version(self) -> Optional[str]:
if self.meta:
return self.meta["page"].properties.get("Version", "")
if (n_versions := len(self.file.versions)) > 0:
return self.file.versions[n_versions - 1].properties.get("id", None)
return None
def _get_page(self):
"""Retrieves the HTML content of a SharePoint site page through its
CanvasContent1 and LayoutWebpartsContent1 properties."""
try:
content_labels = ["CanvasContent1", "LayoutWebpartsContent1"]
content = self.file.listItemAllFields.select(content_labels).get().execute_query()
pld = (content.properties.get("LayoutWebpartsContent1", "") or "") + (
content.properties.get("CanvasContent1", "") or ""
)
if pld != "":
pld = unescape(pld)
else:
logger.info(
f"Page {self.meta['page'].get_property('Url', '')} has no retrievable content. \
Dumping empty doc.",
)
pld = "<div></div>"
self.output_dir.mkdir(parents=True, exist_ok=True)
if not self.download_dir.is_dir():
logger.debug(f"Creating directory: {self.download_dir}")
self.download_dir.mkdir(parents=True, exist_ok=True)
with self.filename.open(mode="w") as f:
f.write(pld)
except Exception as e:
logger.error(f"Error while downloading and saving file: {self.filename}.")
logger.error(e)
return
logger.info(f"File downloaded: {self.filename}")
def _get_file(self):
try:
fsize = self.file.length
self.output_dir.mkdir(parents=True, exist_ok=True)
if not self.download_dir.is_dir():
logger.debug(f"Creating directory: {self.download_dir}")
self.download_dir.mkdir(parents=True, exist_ok=True)
if fsize > MAX_MB_SIZE:
logger.info(f"Downloading file with size: {fsize} bytes in chunks")
with self.filename.open(mode="wb") as f:
self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
else:
with self.filename.open(mode="wb") as f:
self.file.download(f).execute_query()
except Exception as e:
logger.error(f"Error while downloading and saving file: {self.filename}.")
logger.error(e)
return
logger.info(f"File downloaded: {self.filename}")
@BaseIngestDoc.skip_if_file_exists
@requires_dependencies(["office365"])
def get_file(self):
if not self.meta:
self._get_file()
else:
self._get_page()
return
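
As an aside, the suffix handling in SharepointIngestDoc.__post_init__ keeps compound extensions; a quick standard-library illustration with hypothetical filenames.

from pathlib import Path

print("".join(Path("report.final.pdf").suffixes))  # ".final.pdf"; compound suffixes survive
print("".join(Path("Home.aspx").suffixes))          # ".aspx", which is then mapped to ".html"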
class SharepointConnector(ConnectorCleanupMixin, BaseConnector):
config: SimpleSharepointConfig
tenant: None
def __init__(self, standard_config: StandardConnectorConfig, config: SimpleSharepointConfig):
super().__init__(standard_config, config)
self._setup_client()
@requires_dependencies(["office365"])
def _setup_client(self):
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext
parsed_url = urlparse(self.config.site_url)
site_hostname = (parsed_url.hostname or "").split(".")
tenant_url = site_hostname[0].split("-")
self.process_all = False
self.base_site_url = ""
# NOTE: urlparse returns "" (not None) for a URL with no path component.
if tenant_url[-1] == "admin" and parsed_url.path in ("", "/"):
self.process_all = True
self.base_site_url = parsed_url._replace(
netloc=parsed_url.netloc.replace(site_hostname[0], tenant_url[0]),
).geturl()
elif tenant_url[-1] == "admin":
raise ValueError(
"A site URL of the form https://[tenant]-admin.sharepoint.com \
is required to process all sites within a tenant.",
)
self.client = ClientContext(self.config.site_url).with_credentials(
ClientCredential(self.config.client_id, self.config.client_credential),
)
@requires_dependencies(["office365"])
def _list_files(self, folder, recursive) -> List["File"]:
from office365.runtime.client_request_exception import ClientRequestException
try:
objects = folder.expand(["Files", "Folders"]).get().execute_query()
files = list(objects.files)
if not recursive:
return files
for f in objects.folders:
if "/Forms" in f.serverRelativeUrl:
continue
files += self._list_files(f, recursive)
return files
except ClientRequestException as e:
if e.response.status_code != 404:
logger.info("Caught an error while processing documents %s", e.response.text)
return []
@requires_dependencies(["office365"])
def _list_pages(self, site_client) -> list:
from office365.runtime.client_request_exception import ClientRequestException
try:
pages = site_client.site_pages.pages.get().execute_query()
page_files = []
for page_meta in pages:
page_url = page_meta.get_property("Url", None)
if page_url is None:
logger.info("Missing page URL. Omitting page... ")
continue
page_url = f"/{page_url}" if page_url[0] != "/" else page_url
file_page = site_client.web.get_file_by_server_relative_path(page_url)
site_path = None
if (url_path := (urlparse(site_client.base_url).path)) and (url_path != "/"):
site_path = url_path[1:]
page_files.append(
[file_page, {"page": page_meta, "site_path": site_path}],
)
except ClientRequestException as e:
logger.info("Caught an error while processing pages %s", e.response.text)
return []
return page_files
def initialize(self):
pass
def _ingest_site_docs(self, site_client) -> List["SharepointIngestDoc"]:
root_folder = site_client.web.get_folder_by_server_relative_path(self.config.path)
files = self._list_files(root_folder, self.config.recursive)
if not files:
logger.info(
f"Couldn't process files in path {self.config.path} \
for site {site_client.base_url}",
)
output = [SharepointIngestDoc(self.standard_config, self.config, f, {}) for f in files]
if self.config.process_pages:
page_files = self._list_pages(site_client)
if not page_files:
logger.info(f"Couldn't process pages for site {site_client.base_url}")
page_output = [
SharepointIngestDoc(self.standard_config, self.config, f[0], f[1])
for f in page_files
]
output = output + page_output
return output
def _filter_site_url(self, site):
if site.url is None:
return False
return (site.url[0 : len(self.base_site_url)] == self.base_site_url) and ( # noqa: E203
"/sites/" in site.url
)
@requires_dependencies(["office365"])
def get_ingest_docs(self):
if self.process_all:
logger.debug(self.base_site_url)
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.tenant.administration.tenant import Tenant
tenant = Tenant(self.client)
tenant_sites = tenant.get_site_properties_from_sharepoint_by_filters().execute_query()
tenant_sites = [s.url for s in tenant_sites if self._filter_site_url(s)]
tenant_sites.append(self.base_site_url)
ingest_docs: List[SharepointIngestDoc] = []
for site_url in set(tenant_sites):
logger.info(f"Processing docs for site: {site_url}")
site_client = ClientContext(site_url).with_credentials(
ClientCredential(self.config.client_id, self.config.client_credential),
)
ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
return ingest_docs
else:
return self._ingest_site_docs(self.client)
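
The admin-URL detection in _setup_client reduces to a hostname check; a standalone illustration using only the standard library ("contoso" is a hypothetical tenant name).

from urllib.parse import urlparse

for url in (
    "https://contoso-admin.sharepoint.com",
    "https://contoso.sharepoint.com/sites/mysite",
):
    parsed = urlparse(url)
    host = (parsed.hostname or "").split(".")  # e.g. ["contoso-admin", "sharepoint", "com"]
    tenant = host[0].split("-")                # ["contoso", "admin"] for the admin host
    is_admin = tenant[-1] == "admin" and parsed.path in ("", "/")
    print(url, "->", "process all sites" if is_admin else "single site")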


@@ -16,6 +16,7 @@ from .onedrive import onedrive
from .outlook import outlook
from .reddit import reddit
from .s3 import s3
from .sharepoint import sharepoint
from .slack import slack
from .wikipedia import wikipedia
@@ -38,6 +39,7 @@ __all__ = [
"outlook",
"reddit",
"s3",
"sharepoint",
"slack",
"wikipedia",
]


@@ -17,7 +17,7 @@ def onedrive(
client_id: str,
client_cred: str,
authority_url: Optional[str],
- onedrive_folder: Optional[str],
+ path: Optional[str],
recursive: bool,
**kwargs,
):
@@ -45,7 +45,7 @@ def onedrive(
user_pname=user_pname,
tenant=tenant,
authority_url=authority_url,
- folder=onedrive_folder,
+ path=path,
recursive=recursive,
),
)


@@ -0,0 +1,50 @@
import hashlib
import logging
from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig
from unstructured.ingest.logger import ingest_log_streaming_init, logger
from unstructured.ingest.processor import process_documents
from unstructured.ingest.runner.utils import update_download_dir_hash
def sharepoint(
verbose: bool,
connector_config: StandardConnectorConfig,
processor_config: ProcessorConfigs,
site: str,
client_id: str,
client_cred: str,
files_only: bool,
path: str,
recursive: bool,
**kwargs,
):
ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
hashed_dir_name = hashlib.sha256(
f"{site}_{path}".encode("utf-8"),
)
connector_config.download_dir = update_download_dir_hash(
connector_config=connector_config,
hashed_dir_name=hashed_dir_name,
logger=logger,
)
from unstructured.ingest.connector.sharepoint import (
SharepointConnector,
SimpleSharepointConfig,
)
doc_connector = SharepointConnector( # type: ignore
standard_config=connector_config,
config=SimpleSharepointConfig(
client_id=client_id,
client_credential=client_cred,
site_url=site,
path=path,
process_pages=(not files_only),
recursive=recursive,
),
)
process_documents(doc_connector=doc_connector, processor_config=processor_config)
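
The download directory is keyed on a digest of the site and path, so repeated runs against the same site reuse the same cache location; a minimal sketch of the digest computation (directory naming itself is left to update_download_dir_hash, whose internals are not shown in this diff).

import hashlib

site, path = "https://contoso.sharepoint.com", "Shared Documents"  # hypothetical values
digest = hashlib.sha256(f"{site}_{path}".encode("utf-8")).hexdigest()
print(digest)  # stable: the same site and path always yield the same key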