mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 20:27:37 +00:00
feat: add OneDrive connector (#834)
This commit is contained in:
parent
26da51c765
commit
ce20c3f2bc
3
.github/workflows/ci.yml
vendored
3
.github/workflows/ci.yml
vendored
@ -188,6 +188,8 @@ jobs:
|
||||
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
|
||||
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
|
||||
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
|
||||
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
|
||||
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
@ -206,6 +208,7 @@ jobs:
|
||||
make install-ingest-google-drive
|
||||
make install-ingest-github
|
||||
make install-ingest-gitlab
|
||||
make install-ingest-onedrive
|
||||
make install-ingest-slack
|
||||
make install-ingest-wikipedia
|
||||
./test_unstructured_ingest/test-ingest.sh
|
||||
|
@ -63,6 +63,8 @@ jobs:
|
||||
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
|
||||
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
|
||||
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
|
||||
OVERWRITE_FIXTURES: "true"
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
@ -81,6 +83,7 @@ jobs:
|
||||
make install-ingest-google-drive
|
||||
make install-ingest-github
|
||||
make install-ingest-gitlab
|
||||
make install-ingest-onedrive
|
||||
make install-ingest-slack
|
||||
make install-ingest-wikipedia
|
||||
./test_unstructured_ingest/test-ingest.sh
|
||||
|
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.8.2-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* Adds OneDrive connector.
|
||||
|
||||
## 0.8.2-dev0
|
||||
|
||||
### Enhancements
|
||||
|
4
Makefile
4
Makefile
@ -86,6 +86,10 @@ install-ingest-github:
|
||||
install-ingest-gitlab:
|
||||
python3 -m pip install -r requirements/ingest-gitlab.txt
|
||||
|
||||
.PHONY: install-ingest-onedrive
|
||||
install-ingest-onedrive:
|
||||
python3 -m pip install -r requirements/ingest-onedrive.txt
|
||||
|
||||
.PHONY: install-ingest-reddit
|
||||
install-ingest-reddit:
|
||||
python3 -m pip install -r requirements/ingest-reddit.txt
|
||||
|
30
examples/ingest/onedrive/onedrive.sh
Executable file
30
examples/ingest/onedrive/onedrive.sh
Executable file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Processes the files stored in a user's OneDrive
|
||||
# through Unstructured's library in 2 processes.
|
||||
|
||||
# Structured outputs are stored in onedrive-ingest-output/
|
||||
|
||||
# NOTE, this script is not ready-to-run!
|
||||
# You must enter an Azure AD app client-id, client secret, and user principal name
|
||||
# before running.
|
||||
|
||||
# To get the credentials for your Azure AD app, follow these steps:
|
||||
# https://learn.microsoft.com/en-us/graph/auth-register-app-v2
|
||||
# https://learn.microsoft.com/en-us/graph/auth-v2-service
|
||||
|
||||
# Assign the necessary permissions for the application to read from OneDrive.
|
||||
# https://learn.microsoft.com/en-us/graph/permissions-reference
|
||||
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/../../.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--ms-client-id "<Azure AD app client-id>" \
|
||||
--ms-client-cred "<Azure AD app client-secret>" \
|
||||
--ms-authority-url "<Authority URL, default is https://login.microsoftonline.com>" \
|
||||
--ms-tenant "<Azure AD tenant_id, default is 'common'>" \
|
||||
--ms-user-pname "<Azure AD principal name, in most cases is the email linked to the drive>" \
|
||||
--structured-output-dir onedrive-ingest-output \
|
||||
--num-processes 2 \
|
||||
--verbose
|
4
requirements/ingest-onedrive.in
Normal file
4
requirements/ingest-onedrive.in
Normal file
@ -0,0 +1,4 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
msal
|
||||
Office365-REST-Python-Client
|
54
requirements/ingest-onedrive.txt
Normal file
54
requirements/ingest-onedrive.txt
Normal file
@ -0,0 +1,54 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile ingest-onedrive.in
|
||||
#
|
||||
certifi==2023.5.7
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c constraints.in
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# cryptography
|
||||
charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# requests
|
||||
cryptography==41.0.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# msal
|
||||
# pyjwt
|
||||
idna==3.4
|
||||
# via
|
||||
# -c base.txt
|
||||
# requests
|
||||
msal==1.22.0
|
||||
# via
|
||||
# -r ingest-onedrive.in
|
||||
# office365-rest-python-client
|
||||
office365-rest-python-client==2.4.1
|
||||
# via -r ingest-onedrive.in
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c base.txt
|
||||
# cffi
|
||||
pyjwt[crypto]==2.7.0
|
||||
# via msal
|
||||
pytz==2023.3
|
||||
# via
|
||||
# -c base.txt
|
||||
# office365-rest-python-client
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# msal
|
||||
# office365-rest-python-client
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c constraints.in
|
||||
# requests
|
@ -0,0 +1,62 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "This is a test document to use for unit tests."
|
||||
},
|
||||
{
|
||||
"type": "Address",
|
||||
"element_id": "a9d4657034aa3fdb5177f1325e912362",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Doylestown, PA 18901"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9c218520320f238595f1fde74bdd137d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Important points:"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Hamburgers are delicious"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Dogs are the best"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "0b61e826b1c4ab05750184da72b89f83",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "I love fuzzy blankets"
|
||||
}
|
||||
]
|
@ -0,0 +1,62 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "This is a test document to use for unit tests."
|
||||
},
|
||||
{
|
||||
"type": "Address",
|
||||
"element_id": "a9d4657034aa3fdb5177f1325e912362",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Doylestown, PA 18901"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9c218520320f238595f1fde74bdd137d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Important points:"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Hamburgers are delicious"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Dogs are the best"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "0b61e826b1c4ab05750184da72b89f83",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "fake-text.txt",
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "I love fuzzy blankets"
|
||||
}
|
||||
]
|
@ -0,0 +1,41 @@
|
||||
[
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "b3e92c24311471ee2c4884b010dd55a0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "tests-example.xls",
|
||||
"filetype": "application/vnd.ms-excel",
|
||||
"page_number": 1,
|
||||
"page_name": "Example Test",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n MA\n What C datatypes are 8 bits? (assume i386)\n int\n \n float\n \n double\n \n char\n \n \n TF\n Bagpipes are awesome.\n true\n \n \n \n \n \n \n \n \n ESS\n How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n \n \n \n \n \n \n \n \n \n ORD\n Rank the following in their order of operation.\n Parentheses\n Exponents\n Division\n Addition\n \n \n \n \n \n FIB\n The student activities fee is\n 95\n dollars for students enrolled in\n 19\n units or more,\n \n \n \n \n \n MAT\n Match the lower-case greek letter with its capital form.\n λ\n Λ\n α\n γ\n Γ\n φ\n Φ\n \n \n"
|
||||
},
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "adf2eb068afa00f6dfaa4adf8195ce25",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "tests-example.xls",
|
||||
"filetype": "application/vnd.ms-excel",
|
||||
"page_number": 2,
|
||||
"page_name": "Format Abbr.",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Question Format Abbreviations</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n \n \n Question Format Abbreviations\n \n \n \n \n \n \n \n Abbreviation\n Question Type\n \n \n MC\n Multiple Choice\n \n \n MA\n Multiple Answer\n \n \n TF\n True/False\n \n \n ESS\n Essay\n \n \n ORD\n Ordering\n \n \n MAT\n Matching\n \n \n FIB\n Fill in the Blank\n \n \n FIL\n File response\n \n \n NUM\n Numeric Response\n \n \n SR\n Short response\n \n \n OP\n Opinion\n \n \n FIB_PLUS\n Multiple Fill in the Blank\n \n \n JUMBLED_SENTENCE\n Jumbled Sentence\n \n \n QUIZ_BOWL\n Quiz Bowl\n \n \n"
|
||||
},
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "55c06f516945f32a0187cfd94ba7e074",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "tests-example.xls",
|
||||
"filetype": "application/vnd.ms-excel",
|
||||
"page_number": 3,
|
||||
"page_name": "Readme",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>File Information</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Source</td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Version</td>\n </tr>\n <tr>\n <td>1.0 (January 2012)</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Contact</td>\n </tr>\n <tr>\n <td>bb-help@andrew.cmu.edu</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>About</td>\n </tr>\n <tr>\n <td>This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n File Information\n \n \n \n \n \n \n \n \n Source\n \n \n http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n \n \n \n \n \n \n \n \n Version\n \n \n 1.0 (January 2012)\n \n \n \n \n \n \n \n \n Contact\n \n \n bb-help@andrew.cmu.edu\n \n \n \n \n \n \n \n \n About\n \n \n This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n \n \n"
|
||||
}
|
||||
]
|
31
test_unstructured_ingest/test-ingest-onedrive.sh
Executable file
31
test_unstructured_ingest/test-ingest-onedrive.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=onedrive
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ]; then
|
||||
echo "Skipping OneDrive ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED env var is not set."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
--ms-client-cred "$MS_CLIENT_CRED" \
|
||||
--ms-client-id "$MS_CLIENT_ID" \
|
||||
--ms-tenant "3d60a7e5-1e32-414e-839b-1c6e6782613d" \
|
||||
--ms-user-pname "devops@unstructuredio.onmicrosoft.com" \
|
||||
--ms-onedrive-folder '/utic-test-ingest-fixtures' \
|
||||
--metadata-exclude file_directory,metadata.data_source.date_processed \
|
||||
--num-processes 2 \
|
||||
--partition-strategy hi_res \
|
||||
--preserve-downloads \
|
||||
--recursive \
|
||||
--reprocess \
|
||||
--structured-output-dir "$OUTPUT_DIR"
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
@ -21,6 +21,7 @@ export OMP_THREAD_LIMIT=1
|
||||
./test_unstructured_ingest/test-ingest-slack.sh
|
||||
./test_unstructured_ingest/test-ingest-against-api.sh
|
||||
./test_unstructured_ingest/test-ingest-gcs.sh
|
||||
./test_unstructured_ingest/test-ingest-onedrive.sh
|
||||
./test_unstructured_ingest/test-ingest-elasticsearch.sh
|
||||
./test_unstructured_ingest/test-ingest-local-single-file.sh
|
||||
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.8.2-dev0" # pragma: no cover
|
||||
__version__ = "0.8.2-dev1" # pragma: no cover
|
||||
|
158
unstructured/ingest/connector/onedrive.py
Normal file
158
unstructured/ingest/connector/onedrive.py
Normal file
@ -0,0 +1,158 @@
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, List
|
||||
|
||||
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnector,
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
ConnectorCleanupMixin,
|
||||
IngestDocCleanupMixin,
|
||||
StandardConnectorConfig,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from office365.onedrive.driveitems.driveItem import DriveItem
|
||||
|
||||
MAX_MB_SIZE = 512_000_000
|
||||
|
||||
|
||||
@dataclass
class SimpleOneDriveConfig(BaseConnectorConfig):
    """Connection settings for reading a user's OneDrive through Microsoft Graph.

    ``client_id``, ``client_credential`` and ``user_pname`` are mandatory; a
    ``ValueError`` is raised at construction time when any of them is empty.
    After construction, ``token_factory`` holds a zero-argument callable that
    requests a token for the ``https://graph.microsoft.com/.default`` scope.
    """

    # Application (client) ID of the Azure AD app.
    client_id: str
    # Client secret of the Azure AD app; kept out of repr() so it is not logged.
    client_credential: str = field(repr=False)
    # User principal name whose drive is read.
    user_pname: str
    # Tenant segment appended to ``authority_url`` to form the MSAL authority.
    tenant: str = field(repr=False)
    # Base URL of the token authority.
    authority_url: str = field(repr=False)
    # Optional folder inside the drive to start from; falsy means the drive root.
    folder: str = field(default="")
    # Whether sub-folders are traversed as well.
    recursive: bool = False

    def __post_init__(self):
        """Validate the mandatory values and expose the token callable.

        Raises:
            ValueError: if the client id, client credential or user principal
                name is missing or empty.
        """
        if not (self.client_id and self.client_credential and self.user_pname):
            # All three values are required; name the flags exactly as the CLI does.
            raise ValueError(
                "Please provide all of the following mandatory values:"
                "\n--ms-client-id\n--ms-client-cred\n--ms-user-pname",
            )
        self.token_factory = self._acquire_token

    @requires_dependencies(["msal"])
    def _acquire_token(self):
        """Request an app-only token for Microsoft Graph via MSAL.

        Returns:
            The token response returned by ``acquire_token_for_client``.

        Raises:
            ValueError: if MSAL rejects the configuration, or if the response
                does not contain an ``access_token``.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.client_credential,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError:
            logger.error("Couldn't set up credentials for OneDrive")
            raise

        # MSAL reports authentication failures through an error payload instead
        # of an exception; fail here rather than sending an unusable token on.
        if "access_token" not in token:
            logger.error("Couldn't set up credentials for OneDrive")
            raise ValueError(
                "Failed to acquire a Microsoft Graph token: "
                f"{token.get('error')}: {token.get('error_description')}",
            )
        return token
|
||||
|
||||
|
||||
@dataclass
class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
    """A single OneDrive file to be downloaded and processed.

    On construction the file extension is validated against
    ``EXT_TO_FILETYPE`` and the local download / output paths are derived from
    the item's parent folder path.
    """

    config: SimpleOneDriveConfig
    # The drive item this document wraps.
    file: "DriveItem"

    def __post_init__(self):
        """Validate the file extension and compute the local paths.

        Raises:
            ValueError: if the file name has no extension or the extension is
                not a key of ``EXT_TO_FILETYPE``.
        """
        suffixes = Path(self.file.name).suffixes
        if not suffixes:
            raise ValueError("Unsupported file without extension.")

        # Prefer the full compound suffix when it is a known type; otherwise
        # fall back to the last suffix so that names containing extra dots
        # (e.g. "report.v2.pdf") are not rejected as ".v2.pdf".
        compound = "".join(suffixes)
        self.ext = compound if compound in EXT_TO_FILETYPE else suffixes[-1]

        if self.ext not in EXT_TO_FILETYPE:
            raise ValueError(
                f"Extension not supported. "
                f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
            )
        self._set_download_paths()

    def _set_download_paths(self) -> None:
        """Parses the folder structure from the source and creates the download and output paths"""
        download_path = Path(f"{self.standard_config.download_dir}")
        output_path = Path(f"{self.standard_config.output_dir}")

        # The parent reference may be absent; treat that like an empty path
        # instead of failing on a missing ``path`` attribute.
        parent_ref = self.file.get_property("parentReference", None)
        parent_path = getattr(parent_ref, "path", None) or ""

        # Keep only what follows the last ":" and drop one leading slash so
        # the remainder can be joined below the local base directories.
        relative_dir = parent_path.split(":")[-1]
        if relative_dir.startswith("/"):
            relative_dir = relative_dir[1:]
        if relative_dir:
            download_path = (download_path / relative_dir).resolve()
            output_path = (output_path / relative_dir).resolve()

        self.download_dir = download_path
        self.download_filepath = (download_path / self.file.name).resolve()
        # The output file swaps the validated extension for ".json".
        oname = f"{self.file.name[:-len(self.ext)]}.json"
        self.output_dir = output_path
        self.output_filepath = (output_path / oname).resolve()

    @property
    def filename(self):
        """Resolved local path the file is downloaded to."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Resolved local path of the structured JSON output."""
        return Path(self.output_filepath).resolve()

    @BaseIngestDoc.skip_if_file_exists
    @requires_dependencies(["office365"])
    def get_file(self):
        """Download the drive item to ``self.filename``.

        Files whose reported size exceeds ``MAX_MB_SIZE`` are fetched with a
        chunked download session; smaller ones with a single download call.
        Any error is logged and swallowed (best effort), and a partially
        written file is removed so it is not mistaken for a finished download.
        """
        download_started = False
        try:
            fsize = self.file.get_property("size", 0)
            self.output_dir.mkdir(parents=True, exist_ok=True)

            if not self.download_dir.is_dir():
                logger.debug(f"Creating directory: {self.download_dir}")
                self.download_dir.mkdir(parents=True, exist_ok=True)

            # NOTE: despite its name, MAX_MB_SIZE is compared against a size
            # that the log message below reports in bytes.
            chunked = fsize > MAX_MB_SIZE
            if chunked:
                logger.info(f"Downloading file with size: {fsize} bytes in chunks")

            download_started = True
            with self.filename.open(mode="wb") as f:
                if chunked:
                    self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
                else:
                    self.file.download(f).execute_query()
        except Exception as e:
            logger.error(f"Error while downloading and saving file: {self.filename}.")
            logger.error(e)
            if download_started:
                # Opening with "wb" has already truncated the target; drop the
                # incomplete file so a later run downloads it again.
                try:
                    self.filename.unlink()
                except OSError:
                    logger.debug(f"Could not remove incomplete file: {self.filename}")
            return
        logger.info(f"File downloaded: {self.filename}")
        return
|
||||
|
||||
|
||||
class OneDriveConnector(ConnectorCleanupMixin, BaseConnector):
    """Connector that lists the files of a OneDrive user as ingest documents."""

    config: SimpleOneDriveConfig

    def __init__(self, standard_config: StandardConnectorConfig, config: SimpleOneDriveConfig):
        """Store both configs through the base class and build the Graph client."""
        super().__init__(standard_config, config)
        self._set_client()

    @requires_dependencies(["office365"])
    def _set_client(self):
        """Create the Graph client, handing it the config's token callable."""
        from office365.graph_client import GraphClient

        token_callback = self.config.token_factory
        self.client = GraphClient(token_callback)

    def _list_objects(self, folder, recursive) -> List["DriveItem"]:
        """Return the files directly under ``folder``.

        When ``recursive`` is truthy, the files of every sub-folder are
        appended after the folder's own files, one sub-folder at a time.
        """
        children = folder.children.get().execute_query()
        collected = [item for item in children if item.is_file]
        if recursive:
            subfolders = [item for item in children if item.is_folder]
            for subfolder in subfolders:
                collected.extend(self._list_objects(subfolder, recursive))
        return collected

    def initialize(self):
        """Nothing to prepare for this connector."""
        pass

    def get_ingest_docs(self):
        """Build one ``OneDriveIngestDoc`` per file found under the start folder.

        The start folder is the drive root unless ``config.folder`` is set.

        Raises:
            ValueError: if ``config.folder`` is set but does not resolve to a folder.
        """
        drive = self.client.users[self.config.user_pname].drive.get().execute_query()
        start = drive.root

        fpath = self.config.folder
        if fpath:
            start = start.get_by_path(fpath).get().execute_query()
            if start is None or not start.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")

        drive_files = self._list_objects(start, self.config.recursive)
        docs = []
        for drive_file in drive_files:
            docs.append(OneDriveIngestDoc(self.standard_config, self.config, drive_file))
        return docs
|
@ -391,6 +391,37 @@ class MainProcess:
|
||||
default=None,
|
||||
help="Number of days to go back in the history of discord channels, must be an number",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-client-id",
|
||||
default=None,
|
||||
help="Microsoft app client ID",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-client-cred",
|
||||
default=None,
|
||||
help="Microsoft App client secret",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-authority-url",
|
||||
default="https://login.microsoftonline.com",
|
||||
help="Authentication token provider for Microsoft apps, default is "
|
||||
"https://login.microsoftonline.com",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-tenant",
|
||||
default="common",
|
||||
help="ID or domain name associated with your Azure AD instance",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-user-pname",
|
||||
default=None,
|
||||
help="User principal name, usually is your Azure AD email.",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-onedrive-folder",
|
||||
default=None,
|
||||
help="Folder to start parsing files from.",
|
||||
)
|
||||
@click.option(
|
||||
"--elasticsearch-url",
|
||||
default=None,
|
||||
@ -488,6 +519,12 @@ def main(
|
||||
discord_channels,
|
||||
discord_token,
|
||||
discord_period,
|
||||
ms_client_id,
|
||||
ms_client_cred,
|
||||
ms_authority_url,
|
||||
ms_tenant,
|
||||
ms_user_pname,
|
||||
ms_onedrive_folder,
|
||||
elasticsearch_url,
|
||||
elasticsearch_index_name,
|
||||
jq_query,
|
||||
@ -590,6 +627,10 @@ def main(
|
||||
hashed_dir_name = hashlib.sha256(
|
||||
f"{elasticsearch_url}_{elasticsearch_index_name}".encode("utf-8"),
|
||||
)
|
||||
elif ms_user_pname:
|
||||
hashed_dir_name = hashlib.sha256(
|
||||
f"{ms_tenant}_{ms_user_pname}".encode("utf-8"),
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"This connector does not support saving downloads to ~/.cache/ ,"
|
||||
@ -815,6 +856,25 @@ def main(
|
||||
decay=biomed_decay,
|
||||
),
|
||||
)
|
||||
elif ms_client_id or ms_user_pname:
|
||||
from unstructured.ingest.connector.onedrive import (
|
||||
OneDriveConnector,
|
||||
SimpleOneDriveConfig,
|
||||
)
|
||||
|
||||
doc_connector = OneDriveConnector( # type: ignore
|
||||
standard_config=standard_config,
|
||||
config=SimpleOneDriveConfig(
|
||||
client_id=ms_client_id,
|
||||
client_credential=ms_client_cred,
|
||||
user_pname=ms_user_pname,
|
||||
tenant=ms_tenant,
|
||||
authority_url=ms_authority_url,
|
||||
folder=ms_onedrive_folder,
|
||||
recursive=recursive,
|
||||
),
|
||||
)
|
||||
|
||||
elif local_input_path:
|
||||
from unstructured.ingest.connector.local import (
|
||||
LocalConnector,
|
||||
|
Loading…
x
Reference in New Issue
Block a user