feat: Sharepoint connector (#918)

rvztz 2023-08-10 10:37:58 -06:00 committed by GitHub
parent ef5091f276
commit dee9b405cd
25 changed files with 913 additions and 8 deletions


@@ -200,6 +200,9 @@ jobs:
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |


@@ -72,6 +72,9 @@ jobs:
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
OVERWRITE_FIXTURES: "true"


@@ -1,3 +1,14 @@
## 0.9.2-dev3

### Enhancements

### Features

* Adds SharePoint connector.

### Fixes

## 0.9.2-dev2

examples/ingest/onedrive/ingest.sh Executable file → Normal file

@@ -26,6 +26,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--authority-url "<Authority URL, default is https://login.microsoftonline.com>" \
--tenant "<Azure AD tenant_id, default is 'common'>" \
--user-pname "<Azure AD principal name, in most cases the email linked to the drive>" \
--path "<Path to start parsing files from>" \
--structured-output-dir onedrive-ingest-output \
--num-processes 2 \
--verbose


@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Processes documents from a Microsoft SharePoint site
# through Unstructured's library in 2 processes.
# Structured outputs are stored in sharepoint-ingest-output/
# NOTE, this script is not ready-to-run!
# You must enter a Microsoft SharePoint app client ID, client secret, and
# SharePoint site URL before running; a short Python sketch for sanity-checking
# those credentials follows this script.
# Pass --files-only (a flag, no value) to process only files and skip site pages.
# To get the credentials for your SharePoint app, follow these steps:
# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
sharepoint \
--client-id "<Microsoft Sharepoint app client-id>" \
--client-cred "<Microsoft Sharepoint app client-secret>" \
--site "<e.g. https://contoso.sharepoint.com, or https://contoso-admin.sharepoint.com to process all sites within the tenant>" \
--files-only \
--structured-output-dir sharepoint-ingest-output \
--num-processes 2 \
--verbose
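
A quick way to sanity-check the credentials outside the CLI is a minimal Python sketch built from the same Office365-REST-Python-Client calls the connector uses; the site URL and credentials below are placeholders, not working values.

from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext

# Connect with the app principal credentials (placeholders).
ctx = ClientContext("https://contoso.sharepoint.com").with_credentials(
    ClientCredential("<client-id>", "<client-secret>"),
)
# List the files at the root of the default document library.
folder = ctx.web.get_folder_by_server_relative_path("Shared Documents")
objects = folder.expand(["Files", "Folders"]).get().execute_query()
for f in objects.files:
    print(f.serverRelativeUrl)  # server-relative path of each file found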


@@ -0,0 +1,6 @@
-c constraints.in
-c base.txt
msal==1.23.0
Office365-REST-Python-Client==2.4.2
pyjwt==2.8.0
cryptography==41.0.2


@@ -0,0 +1,50 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-sharepoint.in
#
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -r requirements/ingest-sharepoint.in
# msal
# pyjwt
idna==3.4
# via
# -c requirements/base.txt
# requests
msal==1.23.0
# via
# -r requirements/ingest-sharepoint.in
# office365-rest-python-client
office365-rest-python-client==2.4.2
# via -r requirements/ingest-sharepoint.in
pycparser==2.21
# via cffi
pyjwt[crypto]==2.8.0
# via
# -r requirements/ingest-sharepoint.in
# msal
pytz==2023.3
# via office365-rest-python-client
requests==2.31.0
# via
# -c requirements/base.txt
# msal
# office365-rest-python-client
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests


@@ -0,0 +1,110 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
"server_relative_url": "/Shared Documents/fake-text.txt"
},
"date_created": "2023-06-16T05:04:55Z",
"date_modified": "2023-06-16T05:04:55Z"
},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]


@@ -0,0 +1,37 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "0dfe3d76-00c0-42db-ae1b-8cf22d4b3f10",
"server_relative_url": "/Shared Documents/ideas-page.html"
},
"date_created": "2023-06-16T05:04:47Z",
"date_modified": "2023-06-16T05:04:47Z"
},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,44 @@
[
{
"type": "Table",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "b9956a33-8079-4321-91ea-609def07394d",
"server_relative_url": "/Shared Documents/stanley-cups.xlsx"
},
"date_created": "2023-06-16T05:05:05Z",
"date_modified": "2023-06-16T05:05:05Z"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"metadata": {
"data_source": {
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "b9956a33-8079-4321-91ea-609def07394d",
"server_relative_url": "/Shared Documents/stanley-cups.xlsx"
},
"date_created": "2023-06-16T05:05:05Z",
"date_modified": "2023-06-16T05:05:05Z"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]


@@ -0,0 +1,42 @@
[
{
"type": "Title",
"element_id": "b4e929d8bcfe04189801a8ed61496d17",
"metadata": {
"data_source": {
"version": "1.2",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-06-16T05:12:51Z"
},
"filename": "Home.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Documents"
},
{
"type": "Title",
"element_id": "8d14f6e72de8f18ab1ee5c5330f00653",
"metadata": {
"data_source": {
"version": "1.2",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-06-16T05:12:51Z"
},
"filename": "Home.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Events"
}
]


@@ -0,0 +1,82 @@
[
{
"type": "ListItem",
"element_id": "54bdbe8a7a031cf41a7f99cf3a27b8ff",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "This is a plain text site page for testing purposes"
},
{
"type": "ListItem",
"element_id": "7499f3d6c2534c6017c1c6e08406640f",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "These are bullet points meant for testing"
},
{
"type": "NarrativeText",
"element_id": "3d8a9d73a6fae35d8fd19f8e82578fa5",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam ex tellus, sodales non nulla et, sodales consequat turpis. Etiam vestibulum nisl placerat risus elementum, a sodales purus rhoncus. Sed eget velit pharetra, pretium nisi nec, laoreet ligula. Duis luctus mi in ligula cursus, vel lacinia tortor ultricies. Aenean sit amet sodales odio, a maximus elit. Pellentesque vehicula diam sit amet leo placerat placerat. Integer varius elementum accumsan. Donec posuere elit mauris, eget efficitur nisl viverra vitae."
},
{
"type": "NarrativeText",
"element_id": "27f6715881d63c1795b3c7e17b20090a",
"metadata": {
"data_source": {
"version": "1.0",
"record_locator": {
"site": "https://unstructuredio.sharepoint.com/",
"unique_id": "f4613496-4c63-4128-adf0-3c3e13a5a303",
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/This-is-a-title.aspx"
},
"date_created": "0001-01-01T08:00:00Z",
"date_modified": "2023-07-31T07:03:37Z"
},
"filename": "This-is-a-title.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Integer at dictum nisi. Cras venenatis non velit in posuere. Curabitur tristique, eros eget tristique pellentesque, neque metus ullamcorper ligula, nec posuere neque lacus nec felis. Nulla a libero eget eros consectetur hendrerit. Pellentesque interdum, diam eget tristique pretium, quam lorem pulvinar lorem, a eleifend nisl lectus at ex. Praesent pulvinar ex ut consequat condimentum. Sed rutrum, erat a hendrerit blandit, urna mauris posuere est, at porttitor risus diam non leo. Nullam rutrum vehicula dolor, quis venenatis ligula rutrum sit amet. Nam massa justo, fermentum in dui lacinia, tincidunt imperdiet nunc. Nam posuere tortor ac lectus elementum, non mollis urna consequat. In interdum non tellus sed pellentesque."
}
]


@@ -27,7 +27,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--client-id "$MS_CLIENT_ID" \
--tenant "$MS_TENANT_ID" \
--user-pname "$MS_USER_PNAME" \
- --onedrive-folder '/utic-test-ingest-fixtures' \
+ --path '/utic-test-ingest-fixtures' \
--recursive

sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME


@@ -0,0 +1,32 @@
#!/usr/bin/env bash
set -e
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=Sharepoint
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
echo "Skipping SharePoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 0
fi
# Exclude metadata.last_modified: it will always equal the processing date, because the SharePoint connector creates documents on the fly.
PYTHONPATH=. ./unstructured/ingest/main.py \
sharepoint \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified \
--num-processes 2 \
--partition-strategy hi_res \
--preserve-downloads \
--reprocess \
--structured-output-dir "$OUTPUT_DIR" \
--verbose \
--client-cred "$SHAREPOINT_CRED" \
--client-id "$SHAREPOINT_CLIENT_ID" \
--site "$SHAREPOINT_SITE" \
--path "Shared Documents" \
--recursive

sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME


@@ -26,10 +26,11 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-onedrive.sh
./test_unstructured_ingest/test-ingest-outlook.sh
./test_unstructured_ingest/test-ingest-elasticsearch.sh
-./test_unstructured_ingest/test-ingest-confluence-diff.sh
+#./test_unstructured_ingest/test-ingest-confluence-diff.sh
./test_unstructured_ingest/test-ingest-confluence-large.sh
./test_unstructured_ingest/test-ingest-local-single-file.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
./test_unstructured_ingest/test-ingest-sharepoint.sh


@@ -1 +1 @@
-__version__ = "0.9.2-dev2" # pragma: no cover
+__version__ = "0.9.2-dev3" # pragma: no cover


@@ -30,6 +30,7 @@ subcommands = [
cli_cmds.local,
cli_cmds.elasticsearch,
cli_cmds.confluence,
cli_cmds.sharepoint,
]
for subcommand in subcommands:


@@ -16,6 +16,7 @@ from .onedrive import get_cmd as onedrive
from .outlook import get_cmd as outlook
from .reddit import get_cmd as reddit
from .s3 import get_cmd as s3
from .sharepoint import get_cmd as sharepoint
from .slack import get_cmd as slack
from .wikipedia import get_cmd as wikipedia
@@ -38,6 +39,7 @@ __all__ = [
"outlook",
"reddit",
"s3",
"sharepoint",
"slack",
"wikipedia",
]


@@ -32,7 +32,7 @@ from unstructured.ingest.runner import onedrive as onedrive_fn
help="Microsoft App client secret",
)
@click.option(
- "--onedrive-folder",
+ "--path",
default=None,
help="Folder to start parsing files from.",
)


@@ -0,0 +1,72 @@
import logging
import click
from unstructured.ingest.cli.common import (
add_recursive_option,
add_shared_options,
log_options,
map_to_processor_config,
map_to_standard_config,
run_init_checks,
)
from unstructured.ingest.logger import ingest_log_streaming_init, logger
from unstructured.ingest.runner import sharepoint as sharepoint_fn
@click.command()
@click.option(
"--client-id",
default=None,
help="SharePoint app client ID",
)
@click.option(
"--client-cred",
default=None,
help="SharePoint app client secret",
)
@click.option(
"--site",
default=None,
help="SharePoint site URL. Process either the base URL, e.g. https://[tenant].sharepoint.com, \
or a relative site, e.g. https://[tenant].sharepoint.com/sites/<site_name>. \
To process all sites within the tenant, pass the site URL as \
https://[tenant]-admin.sharepoint.com. \
This requires the app to be registered at the tenant level.",
)
@click.option(
"--path",
default="Shared Documents",
help="Path from which to start parsing files. If the connector processes all sites \
within the tenant, this filter is applied to each site's document library. \
Defaults to 'Shared Documents'.",
)
@click.option(
"--files-only",
is_flag=True,
default=False,
help="Process only files; skip site pages.",
)
def sharepoint(**options):
verbose = options.get("verbose", False)
ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
log_options(options)
try:
run_init_checks(**options)
connector_config = map_to_standard_config(options)
processor_config = map_to_processor_config(options)
sharepoint_fn(
connector_config=connector_config,
processor_config=processor_config,
**options,
)
except Exception as e:
logger.error(e, exc_info=True)
raise click.ClickException(str(e)) from e
def get_cmd() -> click.Command:
cmd = sharepoint
add_recursive_option(cmd)
add_shared_options(cmd)
return cmd
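
For a local smoke test of the option wiring, click's test runner can invoke the command directly. A hedged sketch: the values are placeholders, and with fake credentials the run is expected to fail once the connector tries to connect; the point is only how options map to the command.

from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(
    get_cmd(),
    [
        "--client-id", "<client-id>",
        "--client-cred", "<client-secret>",
        "--site", "https://contoso.sharepoint.com",
        "--files-only",  # flag: process files only, skip site pages
    ],
)
print(result.exit_code, result.output)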


@@ -27,7 +27,7 @@ class SimpleOneDriveConfig(BaseConnectorConfig):
user_pname: str
tenant: str = field(repr=False)
authority_url: Optional[str] = field(repr=False)
- folder: Optional[str] = field(default="")
+ path: Optional[str] = field(default="")
recursive: bool = False
def __post_init__(self):
@@ -150,7 +150,7 @@ class OneDriveConnector(ConnectorCleanupMixin, BaseConnector):
def get_ingest_docs(self):
root = self.client.users[self.config.user_pname].drive.get().execute_query().root
- if fpath := self.config.folder:
+ if fpath := self.config.path:
root = root.get_by_path(fpath).get().execute_query()
if root is None or not root.is_folder:
raise ValueError(f"Unable to find directory, given: {fpath}")


@@ -0,0 +1,328 @@
from dataclasses import dataclass, field
from html import unescape
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from urllib.parse import urlparse
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
BaseIngestDoc,
ConnectorCleanupMixin,
IngestDocCleanupMixin,
StandardConnectorConfig,
)
from unstructured.ingest.logger import logger
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from office365.sharepoint.files.file import File
MAX_MB_SIZE = 512_000_000  # 512 MB in bytes; larger files are downloaded in chunks
@dataclass
class SimpleSharepointConfig(BaseConnectorConfig):
client_id: str
client_credential: str = field(repr=False)
site_url: str
path: str
process_pages: bool = False
recursive: bool = False
def __post_init__(self):
if not (self.client_id and self.client_credential and self.site_url):
raise ValueError(
"Please provide all of the following mandatory values:"
"\n--client-id\n--client-cred\n--site",
)
@dataclass
class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
config: SimpleSharepointConfig
file: "File"
meta: dict
def __post_init__(self):
# Site pages are serialized as HTML; map the .aspx extension to .html as well.
self.ext = "".join(Path(self.file.name).suffixes) if not self.meta else ".html"
self.ext = self.ext if self.ext != ".aspx" else ".html"
if not self.ext:
raise ValueError("Unsupported file without extension.")
if self.ext not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension {self.ext} not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
)
self._set_download_paths()
def _set_download_paths(self) -> None:
"""Parses the folder structure from the source and creates the download and output paths"""
download_path = Path(f"{self.standard_config.download_dir}")
output_path = Path(f"{self.standard_config.output_dir}")
if self.meta:
page_url = self.meta["page"].get_property("Url", "")
parent = (
Path(page_url).with_suffix(self.ext)
if (self.meta["site_path"] is None)
else Path(self.meta["site_path"] + "/" + page_url).with_suffix(self.ext)
)
else:
parent = Path(self.file.serverRelativeUrl[1:])
self.download_dir = (download_path / parent.parent).resolve()
self.download_filepath = (download_path / parent).resolve()
oname = f"{str(parent)[:-len(self.ext)]}.json"
self.output_dir = (output_path / parent.parent).resolve()
self.output_filepath = (output_path / oname).resolve()
@property
def filename(self):
return Path(self.download_filepath).resolve()
@property
def _output_filename(self):
return Path(self.output_filepath).resolve()
@property
def date_created(self) -> Optional[str]:
if self.meta:
return self.meta["page"].properties.get("FirstPublished", None)
return self.file.time_created
@property
def date_modified(self) -> Optional[str]:
if self.meta:
return self.meta["page"].properties.get("Modified", None)
return self.file.time_last_modified
@property
def exists(self) -> Optional[bool]:
if self.meta:
return self.meta["page"].properties.get("FileName", None) and self.meta[
"page"
].properties.get("UniqueId", None)
return self.file.exists
@property
def record_locator(self) -> Optional[Dict[str, Any]]:
if self.meta:
record_source = self.meta["page"]
property_name = "AbsoluteUrl"
resource_url_name = "absolute_url"
else:
record_source = self.file
property_name = "ServerRelativeUrl"
resource_url_name = "server_relative_url"
return {
"site": self.config.site_url,
"unique_id": record_source.get_property("UniqueId", ""),
resource_url_name: record_source.get_property(property_name, ""),
}
@property
def version(self) -> Optional[str]:
if self.meta:
return self.meta["page"].properties.get("Version", "")
if (n_versions := len(self.file.versions)) > 0:
return self.file.versions[n_versions - 1].properties.get("id", None)
return None
def _get_page(self):
"""Retrieves the HTML content of a SharePoint site page through its
CanvasContent1 and LayoutWebpartsContent1 properties."""
try:
content_labels = ["CanvasContent1", "LayoutWebpartsContent1"]
content = self.file.listItemAllFields.select(content_labels).get().execute_query()
pld = (content.properties.get("LayoutWebpartsContent1", "") or "") + (
content.properties.get("CanvasContent1", "") or ""
)
if pld != "":
pld = unescape(pld)
else:
logger.info(
f"Page {self.meta['page'].get_property('Url', '')} has no retrievable content. \
Dumping empty doc.",
)
pld = "<div></div>"
self.output_dir.mkdir(parents=True, exist_ok=True)
if not self.download_dir.is_dir():
logger.debug(f"Creating directory: {self.download_dir}")
self.download_dir.mkdir(parents=True, exist_ok=True)
with self.filename.open(mode="w") as f:
f.write(pld)
except Exception as e:
logger.error(f"Error while downloading and saving file: {self.filename}.")
logger.error(e)
return
logger.info(f"File downloaded: {self.filename}")
def _get_file(self):
try:
fsize = self.file.length
self.output_dir.mkdir(parents=True, exist_ok=True)
if not self.download_dir.is_dir():
logger.debug(f"Creating directory: {self.download_dir}")
self.download_dir.mkdir(parents=True, exist_ok=True)
if fsize > MAX_MB_SIZE:
logger.info(f"Downloading file with size: {fsize} bytes in chunks")
with self.filename.open(mode="wb") as f:
self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
else:
with self.filename.open(mode="wb") as f:
self.file.download(f).execute_query()
except Exception as e:
logger.error(f"Error while downloading and saving file: {self.filename}.")
logger.error(e)
return
logger.info(f"File downloaded: {self.filename}")
@BaseIngestDoc.skip_if_file_exists
@requires_dependencies(["office365"])
def get_file(self):
if not self.meta:
self._get_file()
else:
self._get_page()
return
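
As an aside, the suffix handling in SharepointIngestDoc.__post_init__ keeps compound extensions; a quick standard-library illustration with hypothetical filenames.

from pathlib import Path

print("".join(Path("report.final.pdf").suffixes))  # ".final.pdf"; compound suffixes survive
print("".join(Path("Home.aspx").suffixes))          # ".aspx", which is then mapped to ".html"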
class SharepointConnector(ConnectorCleanupMixin, BaseConnector):
config: SimpleSharepointConfig
tenant: None
def __init__(self, standard_config: StandardConnectorConfig, config: SimpleSharepointConfig):
super().__init__(standard_config, config)
self._setup_client()
@requires_dependencies(["office365"])
def _setup_client(self):
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext
parsed_url = urlparse(self.config.site_url)
site_hostname = (parsed_url.hostname or "").split(".")
tenant_url = site_hostname[0].split("-")
self.process_all = False
self.base_site_url = ""
# NOTE: urlparse returns "" (not None) for a URL with no path component.
if tenant_url[-1] == "admin" and parsed_url.path in ("", "/"):
self.process_all = True
self.base_site_url = parsed_url._replace(
netloc=parsed_url.netloc.replace(site_hostname[0], tenant_url[0]),
).geturl()
elif tenant_url[-1] == "admin":
raise ValueError(
"A site URL of the form https://[tenant]-admin.sharepoint.com \
is required to process all sites within a tenant.",
)
self.client = ClientContext(self.config.site_url).with_credentials(
ClientCredential(self.config.client_id, self.config.client_credential),
)
@requires_dependencies(["office365"])
def _list_files(self, folder, recursive) -> List["File"]:
from office365.runtime.client_request_exception import ClientRequestException
try:
objects = folder.expand(["Files", "Folders"]).get().execute_query()
files = list(objects.files)
if not recursive:
return files
for f in objects.folders:
if "/Forms" in f.serverRelativeUrl:
continue
files += self._list_files(f, recursive)
return files
except ClientRequestException as e:
if e.response.status_code != 404:
logger.info("Caught an error while processing documents %s", e.response.text)
return []
@requires_dependencies(["office365"])
def _list_pages(self, site_client) -> list:
from office365.runtime.client_request_exception import ClientRequestException
try:
pages = site_client.site_pages.pages.get().execute_query()
page_files = []
for page_meta in pages:
page_url = page_meta.get_property("Url", None)
if page_url is None:
logger.info("Missing page URL. Omitting page... ")
continue
page_url = f"/{page_url}" if page_url[0] != "/" else page_url
file_page = site_client.web.get_file_by_server_relative_path(page_url)
site_path = None
if (url_path := (urlparse(site_client.base_url).path)) and (url_path != "/"):
site_path = url_path[1:]
page_files.append(
[file_page, {"page": page_meta, "site_path": site_path}],
)
except ClientRequestException as e:
logger.info("Caught an error while processing pages %s", e.response.text)
return []
return page_files
def initialize(self):
pass
def _ingest_site_docs(self, site_client) -> List["SharepointIngestDoc"]:
root_folder = site_client.web.get_folder_by_server_relative_path(self.config.path)
files = self._list_files(root_folder, self.config.recursive)
if not files:
logger.info(
f"Couldn't process files in path {self.config.path} \
for site {site_client.base_url}",
)
output = [SharepointIngestDoc(self.standard_config, self.config, f, {}) for f in files]
if self.config.process_pages:
page_files = self._list_pages(site_client)
if not page_files:
logger.info(f"Couldn't process pages for site {site_client.base_url}")
page_output = [
SharepointIngestDoc(self.standard_config, self.config, f[0], f[1])
for f in page_files
]
output = output + page_output
return output
def _filter_site_url(self, site):
if site.url is None:
return False
return (site.url[0 : len(self.base_site_url)] == self.base_site_url) and ( # noqa: E203
"/sites/" in site.url
)
@requires_dependencies(["office365"])
def get_ingest_docs(self):
if self.process_all:
logger.debug(self.base_site_url)
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.tenant.administration.tenant import Tenant
tenant = Tenant(self.client)
tenant_sites = tenant.get_site_properties_from_sharepoint_by_filters().execute_query()
tenant_sites = [s.url for s in tenant_sites if self._filter_site_url(s)]
tenant_sites.append(self.base_site_url)
ingest_docs: List[SharepointIngestDoc] = []
for site_url in set(tenant_sites):
logger.info(f"Processing docs for site: {site_url}")
site_client = ClientContext(site_url).with_credentials(
ClientCredential(self.config.client_id, self.config.client_credential),
)
ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
return ingest_docs
else:
return self._ingest_site_docs(self.client)
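
The admin-URL detection in _setup_client reduces to a hostname check; a standalone illustration using only the standard library ("contoso" is a hypothetical tenant name).

from urllib.parse import urlparse

for url in (
    "https://contoso-admin.sharepoint.com",
    "https://contoso.sharepoint.com/sites/mysite",
):
    parsed = urlparse(url)
    host = (parsed.hostname or "").split(".")  # e.g. ["contoso-admin", "sharepoint", "com"]
    tenant = host[0].split("-")                # ["contoso", "admin"] for the admin host
    is_admin = tenant[-1] == "admin" and parsed.path in ("", "/")
    print(url, "->", "process all sites" if is_admin else "single site")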


@@ -16,6 +16,7 @@ from .onedrive import onedrive
from .outlook import outlook
from .reddit import reddit
from .s3 import s3
from .sharepoint import sharepoint
from .slack import slack
from .wikipedia import wikipedia
@@ -38,6 +39,7 @@ __all__ = [
"outlook",
"reddit",
"s3",
"sharepoint",
"slack",
"wikipedia",
]


@@ -17,7 +17,7 @@ def onedrive(
client_id: str,
client_cred: str,
authority_url: Optional[str],
- onedrive_folder: Optional[str],
+ path: Optional[str],
recursive: bool,
**kwargs,
):
@@ -45,7 +45,7 @@ def onedrive(
user_pname=user_pname,
tenant=tenant,
authority_url=authority_url,
- folder=onedrive_folder,
+ path=path,
recursive=recursive,
),
)


@@ -0,0 +1,50 @@
import hashlib
import logging
from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig
from unstructured.ingest.logger import ingest_log_streaming_init, logger
from unstructured.ingest.processor import process_documents
from unstructured.ingest.runner.utils import update_download_dir_hash
def sharepoint(
verbose: bool,
connector_config: StandardConnectorConfig,
processor_config: ProcessorConfigs,
site: str,
client_id: str,
client_cred: str,
files_only: bool,
path: str,
recursive: bool,
**kwargs,
):
ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
hashed_dir_name = hashlib.sha256(
f"{site}_{path}".encode("utf-8"),
)
connector_config.download_dir = update_download_dir_hash(
connector_config=connector_config,
hashed_dir_name=hashed_dir_name,
logger=logger,
)
from unstructured.ingest.connector.sharepoint import (
SharepointConnector,
SimpleSharepointConfig,
)
doc_connector = SharepointConnector( # type: ignore
standard_config=connector_config,
config=SimpleSharepointConfig(
client_id=client_id,
client_credential=client_cred,
site_url=site,
path=path,
process_pages=(not files_only),
recursive=recursive,
),
)
process_documents(doc_connector=doc_connector, processor_config=processor_config)
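
The download directory is keyed on a digest of the site and path, so repeated runs against the same site reuse the same cache location; a minimal sketch of the digest computation (directory naming itself is left to update_download_dir_hash, whose internals are not shown in this diff).

import hashlib

site, path = "https://contoso.sharepoint.com", "Shared Documents"  # hypothetical values
digest = hashlib.sha256(f"{site}_{path}".encode("utf-8")).hexdigest()
print(digest)  # stable: the same site and path always yield the same key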