diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97a43dad0..a7effc5a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -200,6 +200,9 @@ jobs: MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} + SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} + SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} + SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} run: | diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index caf98622a..c54293528 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -72,6 +72,9 @@ jobs: MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} + SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} + SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} + SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} OVERWRITE_FIXTURES: "true" diff --git a/CHANGELOG.md b/CHANGELOG.md index a43d1f319..daea68211 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## 0.9.2-dev3 +======= + +### Enhancements + +### Features + +* Adds Sharepoint connector. + +### Fixes + ## 0.9.2-dev2 ======= diff --git a/examples/ingest/onedrive/ingest.sh b/examples/ingest/onedrive/ingest.sh old mode 100755 new mode 100644 index 0f259835d..be4e5ed6b --- a/examples/ingest/onedrive/ingest.sh +++ b/examples/ingest/onedrive/ingest.sh @@ -26,6 +26,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --authority-url "" \ --tenant "" \ --user-pname "" \ + --path "" \ --structured-output-dir onedrive-ingest-output \ --num-processes 2 \ --verbose diff --git a/examples/ingest/sharepoint/ingest.sh b/examples/ingest/sharepoint/ingest.sh new file mode 100644 index 000000000..4dbf8049b --- /dev/null +++ b/examples/ingest/sharepoint/ingest.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Processes only the files (no site pages, see --files-only) of a Sharepoint site +# through Unstructured's library in 2 processes. + +# Structured outputs are stored in sharepoint-ingest-output/ + +# NOTE, this script is not ready-to-run! +# You must enter a MS Sharepoint app client-id, client secret and sharepoint site url +# before running. + +# To get the credentials for your Sharepoint app, follow these steps: +# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal + + + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "$SCRIPT_DIR"/../../.. || exit 1 + +PYTHONPATH=. 
./unstructured/ingest/main.py \ + sharepoint \ + --client-id "" \ + --client-cred "" \ + --site "" \ + --files-only \ + --structured-output-dir sharepoint-ingest-output \ + --num-processes 2 \ + --verbose diff --git a/requirements/ingest-sharepoint.in b/requirements/ingest-sharepoint.in new file mode 100644 index 000000000..869e0e91c --- /dev/null +++ b/requirements/ingest-sharepoint.in @@ -0,0 +1,6 @@ +-c constraints.in +-c base.txt +msal==1.23.0 +Office365-REST-Python-Client==2.4.2 +pyjwt==2.8.0 +cryptography==41.0.2 \ No newline at end of file diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt new file mode 100644 index 000000000..4674632e6 --- /dev/null +++ b/requirements/ingest-sharepoint.txt @@ -0,0 +1,50 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile requirements/ingest-sharepoint.in +# +certifi==2023.7.22 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +cffi==1.15.1 + # via cryptography +charset-normalizer==3.2.0 + # via + # -c requirements/base.txt + # requests +cryptography==41.0.2 + # via + # -r requirements/ingest-sharepoint.in + # msal + # pyjwt +idna==3.4 + # via + # -c requirements/base.txt + # requests +msal==1.23.0 + # via + # -r requirements/ingest-sharepoint.in + # office365-rest-python-client +office365-rest-python-client==2.4.2 + # via -r requirements/ingest-sharepoint.in +pycparser==2.21 + # via cffi +pyjwt[crypto]==2.8.0 + # via + # -r requirements/ingest-sharepoint.in + # msal +pytz==2023.3 + # via office365-rest-python-client +requests==2.31.0 + # via + # -c requirements/base.txt + # msal + # office365-rest-python-client +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/fake-text.json 
b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/fake-text.json new file mode 100644 index 000000000..14cbe175f --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/fake-text.json @@ -0,0 +1,110 @@ +[ + { + "type": "NarrativeText", + "element_id": "1df8eeb8be847c3a1a7411e3be3e0396", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431", + "server_relative_url": "/Shared Documents/fake-text.txt" + }, + "date_created": "2023-06-16T05:04:55Z", + "date_modified": "2023-06-16T05:04:55Z" + }, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "This is a test document to use for unit tests." + }, + { + "type": "Address", + "element_id": "a9d4657034aa3fdb5177f1325e912362", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431", + "server_relative_url": "/Shared Documents/fake-text.txt" + }, + "date_created": "2023-06-16T05:04:55Z", + "date_modified": "2023-06-16T05:04:55Z" + }, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Doylestown, PA 18901" + }, + { + "type": "Title", + "element_id": "9c218520320f238595f1fde74bdd137d", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431", + "server_relative_url": "/Shared Documents/fake-text.txt" + }, + "date_created": "2023-06-16T05:04:55Z", + "date_modified": "2023-06-16T05:04:55Z" + }, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Important points:" + }, + { + "type": "ListItem", + "element_id": "39a3ae572581d0f1fe7511fd7b3aa414", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": 
"880f80ca-cebf-48d0-b639-aeb671b3c431", + "server_relative_url": "/Shared Documents/fake-text.txt" + }, + "date_created": "2023-06-16T05:04:55Z", + "date_modified": "2023-06-16T05:04:55Z" + }, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Hamburgers are delicious" + }, + { + "type": "ListItem", + "element_id": "fc1adcb8eaceac694e500a103f9f698f", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431", + "server_relative_url": "/Shared Documents/fake-text.txt" + }, + "date_created": "2023-06-16T05:04:55Z", + "date_modified": "2023-06-16T05:04:55Z" + }, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Dogs are the best" + }, + { + "type": "ListItem", + "element_id": "0b61e826b1c4ab05750184da72b89f83", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431", + "server_relative_url": "/Shared Documents/fake-text.txt" + }, + "date_created": "2023-06-16T05:04:55Z", + "date_modified": "2023-06-16T05:04:55Z" + }, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "I love fuzzy blankets" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json new file mode 100644 index 000000000..9c33ed12e --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json @@ -0,0 +1,37 @@ +[ + { + "type": "NarrativeText", + "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "0dfe3d76-00c0-42db-ae1b-8cf22d4b3f10", + "server_relative_url": "/Shared 
Documents/ideas-page.html" + }, + "date_created": "2023-06-16T05:04:47Z", + "date_modified": "2023-06-16T05:04:47Z" + }, + "filename": "ideas-page.html", + "filetype": "text/html", + "page_number": 1, + "links": [ + { + "text": null, + "url": "index.html" + }, + { + "text": null, + "url": "https://twitter.com/stef/status/1617222428727586816" + } + ], + "emphasized_texts": [ + { + "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)", + "tag": "i" + } + ] + }, + "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
+ } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json new file mode 100644 index 000000000..479f519ba --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json @@ -0,0 +1,44 @@ +[ + { + "type": "Table", + "element_id": "c00fc0e5ac303c40f9089791e5e485b1", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "b9956a33-8079-4321-91ea-609def07394d", + "server_relative_url": "/Shared Documents/stanley-cups.xlsx" + }, + "date_created": "2023-06-16T05:05:05Z", + "date_modified": "2023-06-16T05:05:05Z" + }, + "filename": "stanley-cups.xlsx", + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "page_number": 1, + "page_name": "Stanley Cups", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + }, + "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + }, + { + "type": "Table", + "element_id": "31421b5cd94fedb10dc82738503b4505", + "metadata": { + "data_source": { + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "b9956a33-8079-4321-91ea-609def07394d", + "server_relative_url": "/Shared Documents/stanley-cups.xlsx" + }, + "date_created": "2023-06-16T05:05:05Z", + "date_modified": "2023-06-16T05:05:05Z" + }, + "filename": "stanley-cups.xlsx", + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "page_number": 2, + "page_name": "Stanley Cups Since 67", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" + }, + "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/SitePages/Home.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/SitePages/Home.json new file mode 100644 index 000000000..b1b40f580 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/SitePages/Home.json @@ -0,0 +1,42 @@ +[ + { + "type": "Title", + "element_id": "b4e929d8bcfe04189801a8ed61496d17", + "metadata": { + "data_source": { + "version": "1.2", + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10", + "absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx" + }, + "date_created": "0001-01-01T08:00:00Z", + "date_modified": "2023-06-16T05:12:51Z" + }, + "filename": "Home.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "Documents" + }, + { + "type": "Title", + "element_id": "8d14f6e72de8f18ab1ee5c5330f00653", + "metadata": { + "data_source": { + "version": "1.2", + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10", + "absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx" + }, + "date_created": "0001-01-01T08:00:00Z", + "date_modified": "2023-06-16T05:12:51Z" + }, + "filename": "Home.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "Events" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/SitePages/This-is-a-title.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/SitePages/This-is-a-title.json new file mode 100644 index 000000000..b23ba58a3 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/SitePages/This-is-a-title.json @@ -0,0 +1,82 @@ +[ + { 
+ "type": "ListItem", + "element_id": "54bdbe8a7a031cf41a7f99cf3a27b8ff", + "metadata": { + "data_source": { + "version": "1.0", + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303", + "absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx" + }, + "date_created": "0001-01-01T08:00:00Z", + "date_modified": "2023-07-31T07:03:37Z" + }, + "filename": "This-is-a-title.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "This is a plain text site page for testing purposes" + }, + { + "type": "ListItem", + "element_id": "7499f3d6c2534c6017c1c6e08406640f", + "metadata": { + "data_source": { + "version": "1.0", + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303", + "absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx" + }, + "date_created": "0001-01-01T08:00:00Z", + "date_modified": "2023-07-31T07:03:37Z" + }, + "filename": "This-is-a-title.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "These are bullet points meant for testing" + }, + { + "type": "NarrativeText", + "element_id": "3d8a9d73a6fae35d8fd19f8e82578fa5", + "metadata": { + "data_source": { + "version": "1.0", + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303", + "absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx" + }, + "date_created": "0001-01-01T08:00:00Z", + "date_modified": "2023-07-31T07:03:37Z" + }, + "filename": "This-is-a-title.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam ex tellus, sodales non nulla et, sodales consequat turpis. Etiam vestibulum nisl placerat risus elementum, a sodales purus rhoncus. Sed eget velit pharetra, pretium nisi nec, laoreet ligula. 
Duis luctus mi in ligula cursus, vel lacinia tortor ultricies. Aenean sit amet sodales odio, a maximus elit. Pellentesque vehicula diam sit amet leo placerat placerat. Integer varius elementum accumsan. Donec posuere elit mauris, eget efficitur nisl viverra vitae." + }, + { + "type": "NarrativeText", + "element_id": "27f6715881d63c1795b3c7e17b20090a", + "metadata": { + "data_source": { + "version": "1.0", + "record_locator": { + "site": "https://unstructuredio.sharepoint.com/", + "unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303", + "absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx" + }, + "date_created": "0001-01-01T08:00:00Z", + "date_modified": "2023-07-31T07:03:37Z" + }, + "filename": "This-is-a-title.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "Integer at dictum nisi. Cras venenatis non velit in posuere. Curabitur tristique, eros eget tristique pellentesque, neque metus ullamcorper ligula, nec posuere neque lacus nec felis. Nulla a libero eget eros consectetur hendrerit. Pellentesque interdum, diam eget tristique pretium, quam lorem pulvinar lorem, a eleifend nisl lectus at ex. Praesent pulvinar ex ut consequat condimentum. Sed rutrum, erat a hendrerit blandit, urna mauris posuere est, at porttitor risus diam non leo. Nullam rutrum vehicula dolor, quis venenatis ligula rutrum sit amet. Nam massa justo, fermentum in dui lacinia, tincidunt imperdiet nunc. Nam posuere tortor ac lectus elementum, non mollis urna consequat. In interdum non tellus sed pellentesque." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 68e68c4e2..f52acc1af 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -27,7 +27,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --client-id "$MS_CLIENT_ID" \ --tenant "$MS_TENANT_ID" \ --user-pname "$MS_USER_PNAME" \ - --onedrive-folder '/utic-test-ingest-fixtures' \ + --path '/utic-test-ingest-fixtures' \ --recursive \ sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh new file mode 100755 index 000000000..2e29196ca --- /dev/null +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +set -e + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "$SCRIPT_DIR"/.. || exit 1 +OUTPUT_FOLDER_NAME=Sharepoint +OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME + +if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] || [ -z "$SHAREPOINT_SITE" ]; then + echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID, SHAREPOINT_CRED or SHAREPOINT_SITE env var is not set." + exit 0 +fi +# excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly +PYTHONPATH=. 
./unstructured/ingest/main.py \ + sharepoint \ + --download-dir "$DOWNLOAD_DIR" \ + --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified \ + --num-processes 2 \ + --partition-strategy hi_res \ + --preserve-downloads \ + --reprocess \ + --structured-output-dir "$OUTPUT_DIR" \ + --verbose \ + --client-cred "$SHAREPOINT_CRED" \ + --client-id "$SHAREPOINT_CLIENT_ID" \ + --site "$SHAREPOINT_SITE" \ + --path "Shared Documents" \ + --recursive \ + +sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 368f52649..7d656b04f 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -26,10 +26,11 @@ export OMP_THREAD_LIMIT=1 ./test_unstructured_ingest/test-ingest-onedrive.sh ./test_unstructured_ingest/test-ingest-outlook.sh ./test_unstructured_ingest/test-ingest-elasticsearch.sh -./test_unstructured_ingest/test-ingest-confluence-diff.sh +#./test_unstructured_ingest/test-ingest-confluence-diff.sh ./test_unstructured_ingest/test-ingest-confluence-large.sh ./test_unstructured_ingest/test-ingest-local-single-file.sh ./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh ./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh +./test_unstructured_ingest/test-ingest-sharepoint.sh # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6cdc3bc58..bfc12496c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.9.2-dev2" # pragma: no cover +__version__ = "0.9.2-dev3" # pragma: no cover diff --git a/unstructured/ingest/cli/cli.py 
b/unstructured/ingest/cli/cli.py index 756d43291..7ae35de69 100644 --- a/unstructured/ingest/cli/cli.py +++ b/unstructured/ingest/cli/cli.py @@ -30,6 +30,7 @@ subcommands = [ cli_cmds.local, cli_cmds.elasticsearch, cli_cmds.confluence, + cli_cmds.sharepoint, ] for subcommand in subcommands: diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py index 37c4d44e4..34f71080e 100644 --- a/unstructured/ingest/cli/cmds/__init__.py +++ b/unstructured/ingest/cli/cmds/__init__.py @@ -16,6 +16,7 @@ from .onedrive import get_cmd as onedrive from .outlook import get_cmd as outlook from .reddit import get_cmd as reddit from .s3 import get_cmd as s3 +from .sharepoint import get_cmd as sharepoint from .slack import get_cmd as slack from .wikipedia import get_cmd as wikipedia @@ -38,6 +39,7 @@ __all__ = [ "outlook", "reddit", "s3", + "sharepoint", "slack", "wikipedia", ] diff --git a/unstructured/ingest/cli/cmds/onedrive.py b/unstructured/ingest/cli/cmds/onedrive.py index dd4eca8d0..d0b0f6cac 100644 --- a/unstructured/ingest/cli/cmds/onedrive.py +++ b/unstructured/ingest/cli/cmds/onedrive.py @@ -32,7 +32,7 @@ from unstructured.ingest.runner import onedrive as onedrive_fn help="Microsoft App client secret", ) @click.option( - "--onedrive-folder", + "--path", default=None, help="Folder to start parsing files from.", ) diff --git a/unstructured/ingest/cli/cmds/sharepoint.py b/unstructured/ingest/cli/cmds/sharepoint.py new file mode 100644 index 000000000..e7fd3c261 --- /dev/null +++ b/unstructured/ingest/cli/cmds/sharepoint.py @@ -0,0 +1,72 @@ +import logging + +import click + +from unstructured.ingest.cli.common import ( + add_recursive_option, + add_shared_options, + log_options, + map_to_processor_config, + map_to_standard_config, + run_init_checks, +) +from unstructured.ingest.logger import ingest_log_streaming_init, logger +from unstructured.ingest.runner import sharepoint as sharepoint_fn + + +@click.command() +@click.option( + 
"--client-id", + default=None, + help="Sharepoint app client ID", +) +@click.option( + "--client-cred", + default=None, + help="Sharepoint app secret", +) +@click.option( + "--site", + default=None, + help="Sharepoint site url. Process either base url e.g https://[tenant].sharepoint.com \ + or relative sites https://[tenant].sharepoint.com/sites/.\ + To process all sites within the tenant pass a site url as\ + https://[tenant]-admin.sharepoint.com.\ + This requires the app to be registered at a tenant level", +) +@click.option( + "--path", + default="Shared Documents", + help="Path from which to start parsing files. If the connector is to process all sites \ + within the tenant this filter will be applied to all sites document libraries. \ + Default 'Shared Documents'", +) +@click.option( + "--files-only", + is_flag=True, + default=False, + help="Process only files.", +) +def sharepoint(**options): + verbose = options.get("verbose", False) + ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) + log_options(options) + try: + run_init_checks(**options) + connector_config = map_to_standard_config(options) + processor_config = map_to_processor_config(options) + sharepoint_fn( + connector_config=connector_config, + processor_config=processor_config, + **options, + ) + except Exception as e: + logger.error(e, exc_info=True) + raise click.ClickException(str(e)) from e + + +def get_cmd() -> click.Command: + cmd = sharepoint + add_recursive_option(cmd) + add_shared_options(cmd) + return cmd diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py index 24d0e5edc..5f40ee06e 100644 --- a/unstructured/ingest/connector/onedrive.py +++ b/unstructured/ingest/connector/onedrive.py @@ -27,7 +27,7 @@ class SimpleOneDriveConfig(BaseConnectorConfig): user_pname: str tenant: str = field(repr=False) authority_url: Optional[str] = field(repr=False) - folder: Optional[str] = field(default="") + path: Optional[str] = 
field(default="") recursive: bool = False def __post_init__(self): @@ -150,7 +150,7 @@ class OneDriveConnector(ConnectorCleanupMixin, BaseConnector): def get_ingest_docs(self): root = self.client.users[self.config.user_pname].drive.get().execute_query().root - if fpath := self.config.folder: + if fpath := self.config.path: root = root.get_by_path(fpath).get().execute_query() if root is None or not root.is_folder: raise ValueError(f"Unable to find directory, given: {fpath}") diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py new file mode 100644 index 000000000..840ccf894 --- /dev/null +++ b/unstructured/ingest/connector/sharepoint.py @@ -0,0 +1,328 @@ +from dataclasses import dataclass, field +from html import unescape +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional +from urllib.parse import urlparse + +from unstructured.file_utils.filetype import EXT_TO_FILETYPE +from unstructured.ingest.interfaces import ( + BaseConnector, + BaseConnectorConfig, + BaseIngestDoc, + ConnectorCleanupMixin, + IngestDocCleanupMixin, + StandardConnectorConfig, +) +from unstructured.ingest.logger import logger +from unstructured.utils import requires_dependencies + +if TYPE_CHECKING: + from office365.sharepoint.files.file import File + +MAX_MB_SIZE = 512_000_000 + + +@dataclass +class SimpleSharepointConfig(BaseConnectorConfig): + client_id: str + client_credential: str = field(repr=False) + site_url: str + path: str + process_pages: bool = False + recursive: bool = False + + def __post_init__(self): + if not (self.client_id and self.client_credential and self.site_url): + raise ValueError( + "Please provide one of the following mandatory values:" + "\n--client-id\n--client-cred\n--site", + ) + + +@dataclass +class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): + config: SimpleSharepointConfig + file: "File" + meta: dict + + def __post_init__(self): + self.ext = 
"".join(Path(self.file.name).suffixes) if not self.meta else ".html" + self.ext = self.ext if self.ext != ".aspx" else ".html" + + if not self.ext: + raise ValueError("Unsupported file without extension.") + + if self.ext not in EXT_TO_FILETYPE: + raise ValueError( + f"Extension {self.ext} not supported. " + f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", + ) + self._set_download_paths() + + def _set_download_paths(self) -> None: + """Parses the folder structure from the source and creates the download and output paths""" + download_path = Path(f"{self.standard_config.download_dir}") + output_path = Path(f"{self.standard_config.output_dir}") + if self.meta: + page_url = self.meta["page"].get_property("Url", "") + parent = ( + Path(page_url).with_suffix(self.ext) + if (self.meta["site_path"] is None) + else Path(self.meta["site_path"] + "/" + page_url).with_suffix(self.ext) + ) + else: + parent = Path(self.file.serverRelativeUrl[1:]) + self.download_dir = (download_path / parent.parent).resolve() + self.download_filepath = (download_path / parent).resolve() + oname = f"{str(parent)[:-len(self.ext)]}.json" + self.output_dir = (output_path / parent.parent).resolve() + self.output_filepath = (output_path / oname).resolve() + + @property + def filename(self): + return Path(self.download_filepath).resolve() + + @property + def _output_filename(self): + return Path(self.output_filepath).resolve() + + @property + def date_created(self) -> Optional[str]: + if self.meta: + return self.meta["page"].properties.get("FirstPublished", None) + return self.file.time_created + + @property + def date_modified(self) -> Optional[str]: + if self.meta: + return self.meta["page"].properties.get("Modified", None) + return self.file.time_last_modified + + @property + def exists(self) -> Optional[bool]: + if self.meta: + return self.meta["page"].properties.get("FileName", None) and self.meta[ + "page" + ].properties.get("UniqueId", None) + return 
self.file.exists + + @property + def record_locator(self) -> Optional[Dict[str, Any]]: + if self.meta: + record_source = self.meta["page"] + property_name = "AbsoluteUrl" + resource_url_name = "absolute_url" + else: + record_source = self.file + property_name = "ServerRelativeUrl" + resource_url_name = "server_relative_url" + + return { + "site": self.config.site_url, + "unique_id": record_source.get_property("UniqueId", ""), + resource_url_name: record_source.get_property(property_name, ""), + } + + @property + def version(self) -> Optional[str]: + if self.meta: + return self.meta["page"].properties.get("Version", "") + + if (n_versions := len(self.file.versions)) > 0: + return self.file.versions[n_versions - 1].properties.get("id", None) + return None + + def _get_page(self): + """Retrieves HTML content of the Sharepoint site through the CanvasContent1 and + LayoutWebpartsContent1""" + + try: + content_labels = ["CanvasContent1", "LayoutWebpartsContent1"] + content = self.file.listItemAllFields.select(content_labels).get().execute_query() + pld = (content.properties.get("LayoutWebpartsContent1", "") or "") + ( + content.properties.get("CanvasContent1", "") or "" + ) + if pld != "": + pld = unescape(pld) + else: + logger.info( + f"Page {self.meta['page'].get_property('Url', '')} has no retrievable content. \ + Dumping empty doc.", + ) + pld = "
" + + self.output_dir.mkdir(parents=True, exist_ok=True) + if not self.download_dir.is_dir(): + logger.debug(f"Creating directory: {self.download_dir}") + self.download_dir.mkdir(parents=True, exist_ok=True) + with self.filename.open(mode="w") as f: + f.write(pld) + except Exception as e: + logger.error(f"Error while downloading and saving file: {self.filename}.") + logger.error(e) + return + logger.info(f"File downloaded: {self.filename}") + + def _get_file(self): + try: + fsize = self.file.length + self.output_dir.mkdir(parents=True, exist_ok=True) + + if not self.download_dir.is_dir(): + logger.debug(f"Creating directory: {self.download_dir}") + self.download_dir.mkdir(parents=True, exist_ok=True) + + if fsize > MAX_MB_SIZE: + logger.info(f"Downloading file with size: {fsize} bytes in chunks") + with self.filename.open(mode="wb") as f: + self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query() + else: + with self.filename.open(mode="wb") as f: + self.file.download(f).execute_query() + except Exception as e: + logger.error(f"Error while downloading and saving file: {self.filename}.") + logger.error(e) + return + logger.info(f"File downloaded: {self.filename}") + + @BaseIngestDoc.skip_if_file_exists + @requires_dependencies(["office365"]) + def get_file(self): + if not self.meta: + self._get_file() + else: + self._get_page() + return + + +class SharepointConnector(ConnectorCleanupMixin, BaseConnector): + config: SimpleSharepointConfig + tenant: None + + def __init__(self, standard_config: StandardConnectorConfig, config: SimpleSharepointConfig): + super().__init__(standard_config, config) + self._setup_client() + + @requires_dependencies(["office365"]) + def _setup_client(self): + from office365.runtime.auth.client_credential import ClientCredential + from office365.sharepoint.client_context import ClientContext + + parsed_url = urlparse(self.config.site_url) + site_hostname = (parsed_url.hostname or "").split(".") + tenant_url = 
site_hostname[0].split("-") + self.process_all = False + self.base_site_url = "" + if tenant_url[-1] == "admin" and (parsed_url.path is None or parsed_url.path == "/"): + self.process_all = True + self.base_site_url = parsed_url._replace( + netloc=parsed_url.netloc.replace(site_hostname[0], tenant_url[0]), + ).geturl() + elif tenant_url[-1] == "admin": + raise ValueError( + "A site url in the form of https://[tenant]-admin.sharepoint.com \ + is required to process all sites within a tenant. ", + ) + + self.client = ClientContext(self.config.site_url).with_credentials( + ClientCredential(self.config.client_id, self.config.client_credential), + ) + + @requires_dependencies(["office365"]) + def _list_files(self, folder, recursive) -> List["File"]: + from office365.runtime.client_request_exception import ClientRequestException + + try: + objects = folder.expand(["Files", "Folders"]).get().execute_query() + files = list(objects.files) + if not recursive: + return files + for f in objects.folders: + if "/Forms" in f.serverRelativeUrl: + continue + files += self._list_files(f, recursive) + return files + except ClientRequestException as e: + if e.response.status_code != 404: + logger.info("Caught an error while processing documents %s", e.response.text) + return [] + + @requires_dependencies(["office365"]) + def _list_pages(self, site_client) -> list: + from office365.runtime.client_request_exception import ClientRequestException + + try: + pages = site_client.site_pages.pages.get().execute_query() + page_files = [] + + for page_meta in pages: + page_url = page_meta.get_property("Url", None) + if page_url is None: + logger.info("Missing site_url. Omitting page... 
") + break + page_url = f"/{page_url}" if page_url[0] != "/" else page_url + file_page = site_client.web.get_file_by_server_relative_path(page_url) + site_path = None + if (url_path := (urlparse(site_client.base_url).path)) and (url_path != "/"): + site_path = url_path[1:] + page_files.append( + [file_page, {"page": page_meta, "site_path": site_path}], + ) + except ClientRequestException as e: + logger.info("Caught an error while processing pages %s", e.response.text) + return [] + + return page_files + + def initialize(self): + pass + + def _ingest_site_docs(self, site_client) -> List["SharepointIngestDoc"]: + root_folder = site_client.web.get_folder_by_server_relative_path(self.config.path) + files = self._list_files(root_folder, self.config.recursive) + if not files: + logger.info( + f"Couldn't process files in path {self.config.path} \ + for site {site_client.base_url}", + ) + output = [SharepointIngestDoc(self.standard_config, self.config, f, {}) for f in files] + if self.config.process_pages: + page_files = self._list_pages(site_client) + if not page_files: + logger.info(f"Couldn't process pages for site {site_client.base_url}") + page_output = [ + SharepointIngestDoc(self.standard_config, self.config, f[0], f[1]) + for f in page_files + ] + output = output + page_output + return output + + def _filter_site_url(self, site): + if site.url is None: + return False + return (site.url[0 : len(self.base_site_url)] == self.base_site_url) and ( # noqa: E203 + "/sites/" in site.url + ) + + @requires_dependencies(["office365"]) + def get_ingest_docs(self): + if self.process_all: + logger.debug(self.base_site_url) + from office365.runtime.auth.client_credential import ClientCredential + from office365.sharepoint.client_context import ClientContext + from office365.sharepoint.tenant.administration.tenant import Tenant + + tenant = Tenant(self.client) + tenant_sites = tenant.get_site_properties_from_sharepoint_by_filters().execute_query() + tenant_sites = [s.url for s in 
tenant_sites if self._filter_site_url(s)] + tenant_sites.append(self.base_site_url) + ingest_docs: List[SharepointIngestDoc] = [] + for site_url in set(tenant_sites): + logger.info(f"Processing docs for site: {site_url}") + site_client = ClientContext(site_url).with_credentials( + ClientCredential(self.config.client_id, self.config.client_credential), + ) + ingest_docs = ingest_docs + self._ingest_site_docs(site_client) + return ingest_docs + else: + return self._ingest_site_docs(self.client) diff --git a/unstructured/ingest/runner/__init__.py b/unstructured/ingest/runner/__init__.py index 74923e939..6f794d153 100644 --- a/unstructured/ingest/runner/__init__.py +++ b/unstructured/ingest/runner/__init__.py @@ -16,6 +16,7 @@ from .onedrive import onedrive from .outlook import outlook from .reddit import reddit from .s3 import s3 +from .sharepoint import sharepoint from .slack import slack from .wikipedia import wikipedia @@ -38,6 +39,7 @@ __all__ = [ "outlook", "reddit", "s3", + "sharepoint", "slack", "wikipedia", ] diff --git a/unstructured/ingest/runner/onedrive.py b/unstructured/ingest/runner/onedrive.py index 93d82ae73..4bc6cf97d 100644 --- a/unstructured/ingest/runner/onedrive.py +++ b/unstructured/ingest/runner/onedrive.py @@ -17,7 +17,7 @@ def onedrive( client_id: str, client_cred: str, authority_url: Optional[str], - onedrive_folder: Optional[str], + path: Optional[str], recursive: bool, **kwargs, ): @@ -45,7 +45,7 @@ def onedrive( user_pname=user_pname, tenant=tenant, authority_url=authority_url, - folder=onedrive_folder, + path=path, recursive=recursive, ), ) diff --git a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py new file mode 100644 index 000000000..6eb7c8ed3 --- /dev/null +++ b/unstructured/ingest/runner/sharepoint.py @@ -0,0 +1,50 @@ +import hashlib +import logging + +from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig +from unstructured.ingest.logger import 
ingest_log_streaming_init, logger +from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.utils import update_download_dir_hash + + +def sharepoint( + verbose: bool, + connector_config: StandardConnectorConfig, + processor_config: ProcessorConfigs, + site: str, + client_id: str, + client_cred: str, + files_only: bool, + path: str, + recursive: bool, + **kwargs, +): + ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + f"{site}_{path}".encode("utf-8"), + ) + connector_config.download_dir = update_download_dir_hash( + connector_config=connector_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.sharepoint import ( + SharepointConnector, + SimpleSharepointConfig, + ) + + doc_connector = SharepointConnector( # type: ignore + standard_config=connector_config, + config=SimpleSharepointConfig( + client_id=client_id, + client_credential=client_cred, + site_url=site, + path=path, + process_pages=(not files_only), + recursive=recursive, + ), + ) + + process_documents(doc_connector=doc_connector, processor_config=processor_config)