feat: add OneDrive connector (#834)

This commit is contained in:
rvztz 2023-07-13 14:57:54 -06:00 committed by GitHub
parent 26da51c765
commit ce20c3f2bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 524 additions and 1 deletions

View File

@ -188,6 +188,8 @@ jobs:
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: | run: |
source .venv/bin/activate source .venv/bin/activate
@ -206,6 +208,7 @@ jobs:
make install-ingest-google-drive make install-ingest-google-drive
make install-ingest-github make install-ingest-github
make install-ingest-gitlab make install-ingest-gitlab
make install-ingest-onedrive
make install-ingest-slack make install-ingest-slack
make install-ingest-wikipedia make install-ingest-wikipedia
./test_unstructured_ingest/test-ingest.sh ./test_unstructured_ingest/test-ingest.sh

View File

@ -63,6 +63,8 @@ jobs:
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
OVERWRITE_FIXTURES: "true" OVERWRITE_FIXTURES: "true"
run: | run: |
source .venv/bin/activate source .venv/bin/activate
@ -81,6 +83,7 @@ jobs:
make install-ingest-google-drive make install-ingest-google-drive
make install-ingest-github make install-ingest-github
make install-ingest-gitlab make install-ingest-gitlab
make install-ingest-onedrive
make install-ingest-slack make install-ingest-slack
make install-ingest-wikipedia make install-ingest-wikipedia
./test_unstructured_ingest/test-ingest.sh ./test_unstructured_ingest/test-ingest.sh

View File

@ -1,3 +1,13 @@
## 0.8.2-dev1
### Enhancements
### Features
### Fixes
* Adds OneDrive connector.
## 0.8.2-dev0 ## 0.8.2-dev0
### Enhancements ### Enhancements

View File

@ -86,6 +86,10 @@ install-ingest-github:
install-ingest-gitlab: install-ingest-gitlab:
python3 -m pip install -r requirements/ingest-gitlab.txt python3 -m pip install -r requirements/ingest-gitlab.txt
.PHONY: install-ingest-onedrive
install-ingest-onedrive:
python3 -m pip install -r requirements/ingest-onedrive.txt
.PHONY: install-ingest-reddit .PHONY: install-ingest-reddit
install-ingest-reddit: install-ingest-reddit:
python3 -m pip install -r requirements/ingest-reddit.txt python3 -m pip install -r requirements/ingest-reddit.txt

View File

@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Processes files stored in a OneDrive drive
# through Unstructured's library in 2 processes.
# Structured outputs are stored in onedrive-ingest-output/
# NOTE, this script is not ready-to-run!
# You must enter an Azure AD app client-id, client secret and user principal name
# before running.
# To get the credentials for your Azure AD app, follow these steps:
# https://learn.microsoft.com/en-us/graph/auth-register-app-v2
# https://learn.microsoft.com/en-us/graph/auth-v2-service
# Assign the necessary permissions for the application to read from OneDrive.
# https://learn.microsoft.com/en-us/graph/permissions-reference
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Run from three directories above this script so that the relative
# ./unstructured/ingest/main.py path below resolves.
cd "$SCRIPT_DIR"/../../.. || exit 1
# Replace every "<...>" placeholder with a real value before running.
PYTHONPATH=. ./unstructured/ingest/main.py \
--ms-client-id "<Azure AD app client-id>" \
--ms-client-cred "<Azure AD app client-secret>" \
--ms-authority-url "<Authority URL, default is https://login.microsoftonline.com>" \
--ms-tenant "<Azure AD tenant_id, default is 'common'>" \
--ms-user-pname "<Azure AD principal name, in most cases is the email linked to the drive>" \
--structured-output-dir onedrive-ingest-output \
--num-processes 2 \
--verbose

View File

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
msal
Office365-REST-Python-Client

View File

@ -0,0 +1,54 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile ingest-onedrive.in
#
certifi==2023.5.7
# via
# -c base.txt
# -c constraints.in
# requests
cffi==1.15.1
# via
# -c base.txt
# cryptography
charset-normalizer==3.2.0
# via
# -c base.txt
# requests
cryptography==41.0.1
# via
# -c base.txt
# msal
# pyjwt
idna==3.4
# via
# -c base.txt
# requests
msal==1.22.0
# via
# -r ingest-onedrive.in
# office365-rest-python-client
office365-rest-python-client==2.4.1
# via -r ingest-onedrive.in
pycparser==2.21
# via
# -c base.txt
# cffi
pyjwt[crypto]==2.7.0
# via msal
pytz==2023.3
# via
# -c base.txt
# office365-rest-python-client
requests==2.31.0
# via
# -c base.txt
# msal
# office365-rest-python-client
urllib3==1.26.16
# via
# -c base.txt
# -c constraints.in
# requests

View File

@ -0,0 +1,62 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]

View File

@ -0,0 +1,62 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filename": "fake-text.txt",
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]

View File

@ -0,0 +1,41 @@
[
{
"type": "Table",
"element_id": "b3e92c24311471ee2c4884b010dd55a0",
"metadata": {
"data_source": {},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"page_number": 1,
"page_name": "Example Test",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n MA\n What C datatypes are 8 bits? (assume i386)\n int\n \n float\n \n double\n \n char\n \n \n TF\n Bagpipes are awesome.\n true\n \n \n \n \n \n \n \n \n ESS\n How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n \n \n \n \n \n \n \n \n \n ORD\n Rank the following in their order of operation.\n Parentheses\n Exponents\n Division\n Addition\n \n \n \n \n \n FIB\n The student activities fee is\n 95\n dollars for students enrolled in\n 19\n units or more,\n \n \n \n \n \n MAT\n Match the lower-case greek letter with its capital form.\n λ\n Λ\n α\n γ\n Γ\n φ\n Φ\n \n \n"
},
{
"type": "Table",
"element_id": "adf2eb068afa00f6dfaa4adf8195ce25",
"metadata": {
"data_source": {},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"page_number": 2,
"page_name": "Format Abbr.",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Question Format Abbreviations</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n \n \n Question Format Abbreviations\n \n \n \n \n \n \n \n Abbreviation\n Question Type\n \n \n MC\n Multiple Choice\n \n \n MA\n Multiple Answer\n \n \n TF\n True/False\n \n \n ESS\n Essay\n \n \n ORD\n Ordering\n \n \n MAT\n Matching\n \n \n FIB\n Fill in the Blank\n \n \n FIL\n File response\n \n \n NUM\n Numeric Response\n \n \n SR\n Short response\n \n \n OP\n Opinion\n \n \n FIB_PLUS\n Multiple Fill in the Blank\n \n \n JUMBLED_SENTENCE\n Jumbled Sentence\n \n \n QUIZ_BOWL\n Quiz Bowl\n \n \n"
},
{
"type": "Table",
"element_id": "55c06f516945f32a0187cfd94ba7e074",
"metadata": {
"data_source": {},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"page_number": 3,
"page_name": "Readme",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>File Information</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Source</td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Version</td>\n </tr>\n <tr>\n <td>1.0 (January 2012)</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Contact</td>\n </tr>\n <tr>\n <td>bb-help@andrew.cmu.edu</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>About</td>\n </tr>\n <tr>\n <td>This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n File Information\n \n \n \n \n \n \n \n \n Source\n \n \n http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n \n \n \n \n \n \n \n \n Version\n \n \n 1.0 (January 2012)\n \n \n \n \n \n \n \n \n Contact\n \n \n bb-help@andrew.cmu.edu\n \n \n \n \n \n \n \n \n About\n \n \n This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n \n \n"
}
]

View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Ingest test for the OneDrive connector.
# Runs unstructured/ingest/main.py against the /utic-test-ingest-fixtures
# OneDrive folder, then passes the output folder name to
# check-diff-expected-output.sh.
# The test is skipped (exit 0) when MS_CLIENT_ID or MS_CLIENT_CRED is unset.

set -e

SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "${SCRIPT_DIR}/.." || exit 1

OUTPUT_FOLDER_NAME=onedrive
OUTPUT_DIR="${SCRIPT_DIR}/structured-output/${OUTPUT_FOLDER_NAME}"
DOWNLOAD_DIR="${SCRIPT_DIR}/download/${OUTPUT_FOLDER_NAME}"

# Both credentials are required; without them there is nothing to test.
if [[ -z "${MS_CLIENT_ID}" || -z "${MS_CLIENT_CRED}" ]]; then
  echo "Skipping OneDrive ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED env var is not set."
  exit 0
fi

# Arguments for the ingest CLI, kept in the same order as they are passed.
ingest_args=(
  --download-dir "${DOWNLOAD_DIR}"
  --ms-client-cred "${MS_CLIENT_CRED}"
  --ms-client-id "${MS_CLIENT_ID}"
  --ms-tenant "3d60a7e5-1e32-414e-839b-1c6e6782613d"
  --ms-user-pname "devops@unstructuredio.onmicrosoft.com"
  --ms-onedrive-folder '/utic-test-ingest-fixtures'
  --metadata-exclude file_directory,metadata.data_source.date_processed
  --num-processes 2
  --partition-strategy hi_res
  --preserve-downloads
  --recursive
  --reprocess
  --structured-output-dir "${OUTPUT_DIR}"
)

PYTHONPATH=. ./unstructured/ingest/main.py "${ingest_args[@]}"

sh "${SCRIPT_DIR}/check-diff-expected-output.sh" "${OUTPUT_FOLDER_NAME}"

View File

@ -21,6 +21,7 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-slack.sh ./test_unstructured_ingest/test-ingest-slack.sh
./test_unstructured_ingest/test-ingest-against-api.sh ./test_unstructured_ingest/test-ingest-against-api.sh
./test_unstructured_ingest/test-ingest-gcs.sh ./test_unstructured_ingest/test-ingest-gcs.sh
./test_unstructured_ingest/test-ingest-onedrive.sh
./test_unstructured_ingest/test-ingest-elasticsearch.sh ./test_unstructured_ingest/test-ingest-elasticsearch.sh
./test_unstructured_ingest/test-ingest-local-single-file.sh ./test_unstructured_ingest/test-ingest-local-single-file.sh
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option

View File

@ -1 +1 @@
__version__ = "0.8.2-dev0" # pragma: no cover __version__ = "0.8.2-dev1" # pragma: no cover

View File

@ -0,0 +1,158 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, List
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
BaseIngestDoc,
ConnectorCleanupMixin,
IngestDocCleanupMixin,
StandardConnectorConfig,
)
from unstructured.ingest.logger import logger
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from office365.onedrive.driveitems.driveItem import DriveItem
# Size threshold above which OneDriveIngestDoc.get_file downloads a file through
# download_session in chunks instead of a single download call.
# NOTE(review): despite the "MB" in the name, the value is compared against the
# item's "size" property, which get_file logs as a byte count.
MAX_MB_SIZE = 512_000_000
@dataclass
class SimpleOneDriveConfig(BaseConnectorConfig):
    """Connection settings for the OneDrive connector.

    Holds the Azure AD application credentials and the drive location to ingest
    from. After initialization ``token_factory`` refers to ``_acquire_token`` so
    it can be handed to a client that needs a token callback.

    Raises:
        ValueError: if ``client_id``, ``client_credential`` or ``user_pname`` is
            missing or empty.
    """

    # Microsoft app client ID.
    client_id: str
    # Microsoft app client secret; kept out of ``repr`` output.
    client_credential: str = field(repr=False)
    # User principal name used to select the drive.
    user_pname: str
    # Tenant segment appended to ``authority_url`` when authenticating.
    tenant: str = field(repr=False)
    # Base URL of the token authority.
    authority_url: str = field(repr=False)
    # Folder to start listing from; an empty value means the drive root.
    folder: str = field(default="")
    # Whether sub-folders are listed as well.
    recursive: bool = False

    def __post_init__(self):
        # All three values are required; the message names the CLI flags that set them.
        if not (self.client_id and self.client_credential and self.user_pname):
            raise ValueError(
                "Please provide all of the following mandatory values:"
                "\n--ms-client-id\n--ms-client-cred\n--ms-user-pname",
            )
        self.token_factory = self._acquire_token

    @requires_dependencies(["msal"])
    def _acquire_token(self):
        """Request an application token for Microsoft Graph.

        Returns:
            The token response dict returned by msal.

        Raises:
            ValueError: if msal rejects the configuration, or if the response
                reports an error instead of carrying a token.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.client_credential,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError:
            logger.error("Couldn't set up credentials for OneDrive")
            raise
        # msal reports authentication failures through an "error" key in the
        # response rather than by raising, so surface them here instead of
        # returning an unusable token to the caller.
        if (error := token.get("error")) is not None:
            logger.error("Couldn't set up credentials for OneDrive")
            description = token.get("error_description", "")
            raise ValueError(f"Failed to acquire OneDrive token: {error}. {description}")
        return token
@dataclass
class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
    """A single OneDrive file to be downloaded and processed.

    On creation the file extension is validated against ``EXT_TO_FILETYPE`` and
    the local download and output paths are derived from the item's parent
    folder in the drive.
    """

    config: SimpleOneDriveConfig
    file: "DriveItem"

    def __post_init__(self):
        """Validate the extension and compute the download/output paths.

        Raises:
            ValueError: if the file name has no extension, or its extension is
                not a key of ``EXT_TO_FILETYPE``.
        """
        self.ext = self._resolve_extension(self.file.name)
        if not self.ext:
            raise ValueError("Unsupported file without extension.")

        if self.ext not in EXT_TO_FILETYPE:
            raise ValueError(
                f"Extension not supported. "
                f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
            )
        self._set_download_paths()

    @staticmethod
    def _resolve_extension(name: str) -> str:
        """Return the extension used to classify ``name``.

        The full compound extension is kept when it is a known key. Otherwise
        only the final suffix is used, so a dotted name such as
        "report.v2.pdf" resolves to ".pdf" instead of ".v2.pdf".
        """
        path = Path(name)
        compound = "".join(path.suffixes)
        return compound if compound in EXT_TO_FILETYPE else path.suffix

    def _set_download_paths(self) -> None:
        """Parses the folder structure from the source and creates the download and output paths"""
        download_path = Path(f"{self.standard_config.download_dir}")
        output_path = Path(f"{self.standard_config.output_dir}")

        # The parent reference may be missing or carry no path; treat both as
        # "no sub-directory" rather than failing on attribute access.
        parent = self.file.get_property("parentReference", None)
        parent_path = getattr(parent, "path", None) or ""

        # Keep only the part after the last ":" and drop one leading "/" so the
        # remainder is joined below the local directories as a relative path.
        if parent_ref := parent_path.split(":")[-1]:
            odir = parent_ref[1:] if parent_ref[0] == "/" else parent_ref
            if odir:
                download_path = (download_path / odir).resolve()
                output_path = (output_path / odir).resolve()

        self.download_dir = download_path
        self.download_filepath = (download_path / self.file.name).resolve()
        # Replace the extension with ".json" for the structured output file.
        oname = f"{self.file.name[:-len(self.ext)]}.json"
        self.output_dir = output_path
        self.output_filepath = (output_path / oname).resolve()

    @property
    def filename(self):
        """Resolved local path the file is downloaded to."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Resolved local path of the structured output file."""
        return Path(self.output_filepath).resolve()

    @BaseIngestDoc.skip_if_file_exists
    @requires_dependencies(["office365"])
    def get_file(self):
        """Download the file to ``self.filename``, creating directories as needed.

        Files larger than ``MAX_MB_SIZE`` are fetched through a chunked download
        session. Failures are logged and not raised; a partially written file
        is removed so it is not left behind looking like a finished download.
        """
        write_started = False
        try:
            fsize = self.file.get_property("size", 0)
            self.output_dir.mkdir(parents=True, exist_ok=True)

            if not self.download_dir.is_dir():
                logger.debug(f"Creating directory: {self.download_dir}")
                self.download_dir.mkdir(parents=True, exist_ok=True)

            in_chunks = fsize > MAX_MB_SIZE
            if in_chunks:
                logger.info(f"Downloading file with size: {fsize} bytes in chunks")

            with self.filename.open(mode="wb") as f:
                # From here on the target has been created/truncated on disk.
                write_started = True
                if in_chunks:
                    self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
                else:
                    self.file.download(f).execute_query()

        except Exception as e:
            logger.error(f"Error while downloading and saving file: {self.filename}.")
            logger.error(e)
            if write_started:
                self._remove_partial_download()
            return

        logger.info(f"File downloaded: {self.filename}")

    def _remove_partial_download(self) -> None:
        """Delete an incomplete download, logging (not raising) cleanup errors."""
        try:
            if self.filename.is_file():
                self.filename.unlink()
        except OSError as cleanup_error:
            logger.error(cleanup_error)
class OneDriveConnector(ConnectorCleanupMixin, BaseConnector):
    """Connector that lists files in a user's OneDrive and wraps them as ingest docs."""

    config: SimpleOneDriveConfig

    def __init__(self, standard_config: StandardConnectorConfig, config: SimpleOneDriveConfig):
        super().__init__(standard_config, config)
        self._set_client()

    @requires_dependencies(["office365"])
    def _set_client(self):
        """Create the Graph client, using the config's token factory as its token callback."""
        from office365.graph_client import GraphClient

        self.client = GraphClient(self.config.token_factory)

    def _list_objects(self, folder, recursive) -> List["DriveItem"]:
        """Return the files directly inside ``folder``, plus those of all sub-folders
        when ``recursive`` is true."""
        drive_items = folder.children.get().execute_query()
        files = [d for d in drive_items if d.is_file]
        if not recursive:
            return files

        folders = [d for d in drive_items if d.is_folder]
        for subfolder in folders:
            files += self._list_objects(subfolder, recursive)
        return files

    def initialize(self):
        pass

    def get_ingest_docs(self):
        """Build one ingest doc per supported file under the configured folder.

        Starts at the drive root of ``config.user_pname``, or at ``config.folder``
        when one is set. Files that the ingest doc rejects (no extension or an
        unsupported one) are skipped with a warning, so a single such file does
        not abort the whole run.

        Raises:
            ValueError: if ``config.folder`` is set but is not an existing folder.
        """
        root = self.client.users[self.config.user_pname].drive.get().execute_query().root
        if fpath := self.config.folder:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")

        files = self._list_objects(root, self.config.recursive)

        docs = []
        for drive_file in files:
            try:
                docs.append(OneDriveIngestDoc(self.standard_config, self.config, drive_file))
            except ValueError as e:
                logger.warning(f"Skipping OneDrive file {drive_file.name}: {e}")
        return docs

View File

@ -391,6 +391,37 @@ class MainProcess:
default=None, default=None,
help="Number of days to go back in the history of discord channels, must be an number", help="Number of days to go back in the history of discord channels, must be an number",
) )
@click.option(
"--ms-client-id",
default=None,
help="Microsoft app client ID",
)
@click.option(
"--ms-client-cred",
default=None,
help="Microsoft App client secret",
)
@click.option(
"--ms-authority-url",
default="https://login.microsoftonline.com",
help="Authentication token provider for Microsoft apps, default is "
"https://login.microsoftonline.com",
)
@click.option(
"--ms-tenant",
default="common",
help="ID or domain name associated with your Azure AD instance",
)
@click.option(
"--ms-user-pname",
default=None,
help="User principal name, usually is your Azure AD email.",
)
@click.option(
"--ms-onedrive-folder",
default=None,
help="Folder to start parsing files from.",
)
@click.option( @click.option(
"--elasticsearch-url", "--elasticsearch-url",
default=None, default=None,
@ -488,6 +519,12 @@ def main(
discord_channels, discord_channels,
discord_token, discord_token,
discord_period, discord_period,
ms_client_id,
ms_client_cred,
ms_authority_url,
ms_tenant,
ms_user_pname,
ms_onedrive_folder,
elasticsearch_url, elasticsearch_url,
elasticsearch_index_name, elasticsearch_index_name,
jq_query, jq_query,
@ -590,6 +627,10 @@ def main(
hashed_dir_name = hashlib.sha256( hashed_dir_name = hashlib.sha256(
f"{elasticsearch_url}_{elasticsearch_index_name}".encode("utf-8"), f"{elasticsearch_url}_{elasticsearch_index_name}".encode("utf-8"),
) )
elif ms_user_pname:
hashed_dir_name = hashlib.sha256(
f"{ms_tenant}_{ms_user_pname}".encode("utf-8"),
)
else: else:
raise ValueError( raise ValueError(
"This connector does not support saving downloads to ~/.cache/ ," "This connector does not support saving downloads to ~/.cache/ ,"
@ -815,6 +856,25 @@ def main(
decay=biomed_decay, decay=biomed_decay,
), ),
) )
elif ms_client_id or ms_user_pname:
from unstructured.ingest.connector.onedrive import (
OneDriveConnector,
SimpleOneDriveConfig,
)
doc_connector = OneDriveConnector( # type: ignore
standard_config=standard_config,
config=SimpleOneDriveConfig(
client_id=ms_client_id,
client_credential=ms_client_cred,
user_pname=ms_user_pname,
tenant=ms_tenant,
authority_url=ms_authority_url,
folder=ms_onedrive_folder,
recursive=recursive,
),
)
elif local_input_path: elif local_input_path:
from unstructured.ingest.connector.local import ( from unstructured.ingest.connector.local import (
LocalConnector, LocalConnector,