diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8f80b2bc6..6e6c74b3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -188,6 +188,8 @@ jobs: SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} + MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} + MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} run: | source .venv/bin/activate @@ -206,6 +208,7 @@ jobs: make install-ingest-google-drive make install-ingest-github make install-ingest-gitlab + make install-ingest-onedrive make install-ingest-slack make install-ingest-wikipedia ./test_unstructured_ingest/test-ingest.sh diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 2288bc0cf..5e85ce3d9 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -63,6 +63,8 @@ jobs: DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} + MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} OVERWRITE_FIXTURES: "true" run: | source .venv/bin/activate @@ -81,6 +83,7 @@ jobs: make install-ingest-google-drive make install-ingest-github make install-ingest-gitlab + make install-ingest-onedrive make install-ingest-slack make install-ingest-wikipedia ./test_unstructured_ingest/test-ingest.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index ce6cdf788..268370931 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.8.2-dev1 + +### Enhancements + +### Features + +### Fixes + +* Adds Onedrive connector. 
+ ## 0.8.2-dev0 ### Enhancements diff --git a/Makefile b/Makefile index 3c498e6f9..60dd5bf88 100644 --- a/Makefile +++ b/Makefile @@ -86,6 +86,10 @@ install-ingest-github: install-ingest-gitlab: python3 -m pip install -r requirements/ingest-gitlab.txt +.PHONY: install-ingest-onedrive +install-ingest-onedrive: + python3 -m pip install -r requirements/ingest-onedrive.txt + .PHONY: install-ingest-reddit install-ingest-reddit: python3 -m pip install -r requirements/ingest-reddit.txt diff --git a/examples/ingest/onedrive/onedrive.sh b/examples/ingest/onedrive/onedrive.sh new file mode 100755 index 000000000..3b5796724 --- /dev/null +++ b/examples/ingest/onedrive/onedrive.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# Processes the files of a OneDrive user +# through Unstructured's library in 2 processes. + +# Structured outputs are stored in onedrive-ingest-output/ + +# NOTE, this script is not ready-to-run! +# You must enter an Azure AD app client-id, client secret and user principal name +# before running. + +# To get the credentials for your Azure AD app, follow these steps: +# https://learn.microsoft.com/en-us/graph/auth-register-app-v2 +# https://learn.microsoft.com/en-us/graph/auth-v2-service + +# Assign the necessary permissions for the application to read from OneDrive. +# https://learn.microsoft.com/en-us/graph/permissions-reference + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "$SCRIPT_DIR"/../../.. || exit 1 + +PYTHONPATH=. 
./unstructured/ingest/main.py \ + --ms-client-id "" \ + --ms-client-cred "" \ + --ms-authority-url "" \ + --ms-tenant "" \ + --ms-user-pname "" \ + --structured-output-dir onedrive-ingest-output \ + --num-processes 2 \ + --verbose diff --git a/requirements/ingest-onedrive.in b/requirements/ingest-onedrive.in new file mode 100644 index 000000000..12e5f3cbc --- /dev/null +++ b/requirements/ingest-onedrive.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt +msal +Office365-REST-Python-Client \ No newline at end of file diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt new file mode 100644 index 000000000..01bd97064 --- /dev/null +++ b/requirements/ingest-onedrive.txt @@ -0,0 +1,54 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile ingest-onedrive.in +# +certifi==2023.5.7 + # via + # -c base.txt + # -c constraints.in + # requests +cffi==1.15.1 + # via + # -c base.txt + # cryptography +charset-normalizer==3.2.0 + # via + # -c base.txt + # requests +cryptography==41.0.1 + # via + # -c base.txt + # msal + # pyjwt +idna==3.4 + # via + # -c base.txt + # requests +msal==1.22.0 + # via + # -r ingest-onedrive.in + # office365-rest-python-client +office365-rest-python-client==2.4.1 + # via -r ingest-onedrive.in +pycparser==2.21 + # via + # -c base.txt + # cffi +pyjwt[crypto]==2.7.0 + # via msal +pytz==2023.3 + # via + # -c base.txt + # office365-rest-python-client +requests==2.31.0 + # via + # -c base.txt + # msal + # office365-rest-python-client +urllib3==1.26.16 + # via + # -c base.txt + # -c constraints.in + # requests diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/fake-text.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/fake-text.json new file mode 100644 index 000000000..2acee3a57 --- /dev/null +++ 
b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/fake-text.json @@ -0,0 +1,62 @@ +[ + { + "type": "NarrativeText", + "element_id": "1df8eeb8be847c3a1a7411e3be3e0396", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "This is a test document to use for unit tests." + }, + { + "type": "Address", + "element_id": "a9d4657034aa3fdb5177f1325e912362", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Doylestown, PA 18901" + }, + { + "type": "Title", + "element_id": "9c218520320f238595f1fde74bdd137d", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Important points:" + }, + { + "type": "ListItem", + "element_id": "39a3ae572581d0f1fe7511fd7b3aa414", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Hamburgers are delicious" + }, + { + "type": "ListItem", + "element_id": "fc1adcb8eaceac694e500a103f9f698f", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Dogs are the best" + }, + { + "type": "ListItem", + "element_id": "0b61e826b1c4ab05750184da72b89f83", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "I love fuzzy blankets" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/nested/fake-text.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/nested/fake-text.json new file mode 100644 index 000000000..2acee3a57 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/nested/fake-text.json @@ -0,0 +1,62 @@ +[ + { + "type": "NarrativeText", + "element_id": "1df8eeb8be847c3a1a7411e3be3e0396", + 
"metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "This is a test document to use for unit tests." + }, + { + "type": "Address", + "element_id": "a9d4657034aa3fdb5177f1325e912362", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Doylestown, PA 18901" + }, + { + "type": "Title", + "element_id": "9c218520320f238595f1fde74bdd137d", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Important points:" + }, + { + "type": "ListItem", + "element_id": "39a3ae572581d0f1fe7511fd7b3aa414", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Hamburgers are delicious" + }, + { + "type": "ListItem", + "element_id": "fc1adcb8eaceac694e500a103f9f698f", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "Dogs are the best" + }, + { + "type": "ListItem", + "element_id": "0b61e826b1c4ab05750184da72b89f83", + "metadata": { + "data_source": {}, + "filename": "fake-text.txt", + "filetype": "text/plain" + }, + "text": "I love fuzzy blankets" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.json new file mode 100644 index 000000000..f3cec2669 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.json @@ -0,0 +1,41 @@ +[ + { + "type": "Table", + "element_id": "b3e92c24311471ee2c4884b010dd55a0", + "metadata": { + "data_source": {}, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "page_number": 1, + "page_name": "Example Test", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.λΛαγΓφΦ
" + }, + "text": "\n \n \n MA\n What C datatypes are 8 bits? (assume i386)\n int\n \n float\n \n double\n \n char\n \n \n TF\n Bagpipes are awesome.\n true\n \n \n \n \n \n \n \n \n ESS\n How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n \n \n \n \n \n \n \n \n \n ORD\n Rank the following in their order of operation.\n Parentheses\n Exponents\n Division\n Addition\n \n \n \n \n \n FIB\n The student activities fee is\n 95\n dollars for students enrolled in\n 19\n units or more,\n \n \n \n \n \n MAT\n Match the lower-case greek letter with its capital form.\n λ\n Λ\n α\n γ\n Γ\n φ\n Φ\n \n \n" + }, + { + "type": "Table", + "element_id": "adf2eb068afa00f6dfaa4adf8195ce25", + "metadata": { + "data_source": {}, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "page_number": 2, + "page_name": "Format Abbr.", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
http://www.cmu.edu/blackboard
Question Format Abbreviations
AbbreviationQuestion Type
MCMultiple Choice
MAMultiple Answer
TFTrue/False
ESSEssay
ORDOrdering
MATMatching
FIBFill in the Blank
FILFile response
NUMNumeric Response
SRShort response
OPOpinion
FIB_PLUSMultiple Fill in the Blank
JUMBLED_SENTENCEJumbled Sentence
QUIZ_BOWLQuiz Bowl
" + }, + "text": "\n \n \n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n \n \n Question Format Abbreviations\n \n \n \n \n \n \n \n Abbreviation\n Question Type\n \n \n MC\n Multiple Choice\n \n \n MA\n Multiple Answer\n \n \n TF\n True/False\n \n \n ESS\n Essay\n \n \n ORD\n Ordering\n \n \n MAT\n Matching\n \n \n FIB\n Fill in the Blank\n \n \n FIL\n File response\n \n \n NUM\n Numeric Response\n \n \n SR\n Short response\n \n \n OP\n Opinion\n \n \n FIB_PLUS\n Multiple Fill in the Blank\n \n \n JUMBLED_SENTENCE\n Jumbled Sentence\n \n \n QUIZ_BOWL\n Quiz Bowl\n \n \n" + }, + { + "type": "Table", + "element_id": "55c06f516945f32a0187cfd94ba7e074", + "metadata": { + "data_source": {}, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "page_number": 3, + "page_name": "Readme", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
http://www.cmu.edu/blackboard
File Information
Source
http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls
Version
1.0 (January 2012)
Contact
bb-help@andrew.cmu.edu
About
This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions
" + }, + "text": "\n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n File Information\n \n \n \n \n \n \n \n \n Source\n \n \n http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n \n \n \n \n \n \n \n \n Version\n \n \n 1.0 (January 2012)\n \n \n \n \n \n \n \n \n Contact\n \n \n bb-help@andrew.cmu.edu\n \n \n \n \n \n \n \n \n About\n \n \n This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n \n \n" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh new file mode 100755 index 000000000..78747509e --- /dev/null +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -e + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "$SCRIPT_DIR"/.. || exit 1 +OUTPUT_FOLDER_NAME=onedrive +OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME + +if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ]; then + echo "Skipping OneDrive ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED env var is not set." + exit 0 +fi + +PYTHONPATH=. 
./unstructured/ingest/main.py \
+    --download-dir "$DOWNLOAD_DIR" \
+    --ms-client-cred "$MS_CLIENT_CRED" \
+    --ms-client-id "$MS_CLIENT_ID" \
+    --ms-tenant "3d60a7e5-1e32-414e-839b-1c6e6782613d" \
+    --ms-user-pname "devops@unstructuredio.onmicrosoft.com" \
+    --ms-onedrive-folder '/utic-test-ingest-fixtures' \
+    --metadata-exclude file_directory,metadata.data_source.date_processed \
+    --num-processes 2 \
+    --partition-strategy hi_res \
+    --preserve-downloads \
+    --recursive \
+    --reprocess \
+    --structured-output-dir "$OUTPUT_DIR"
+
+sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
\ No newline at end of file
diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh
index df9fbf197..40cc489ea 100755
--- a/test_unstructured_ingest/test-ingest.sh
+++ b/test_unstructured_ingest/test-ingest.sh
@@ -21,6 +21,7 @@ export OMP_THREAD_LIMIT=1
 ./test_unstructured_ingest/test-ingest-slack.sh
 ./test_unstructured_ingest/test-ingest-against-api.sh
 ./test_unstructured_ingest/test-ingest-gcs.sh
+./test_unstructured_ingest/test-ingest-onedrive.sh
 ./test_unstructured_ingest/test-ingest-elasticsearch.sh
 ./test_unstructured_ingest/test-ingest-local-single-file.sh
 # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 22e2f3ff0..840834c4e 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.8.2-dev0"  # pragma: no cover
+__version__ = "0.8.2-dev1"  # pragma: no cover
diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py
new file mode 100644
index 000000000..ca1a5757a
--- /dev/null
+++ b/unstructured/ingest/connector/onedrive.py
@@ -0,0 +1,231 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, List
+
+from unstructured.file_utils.filetype import EXT_TO_FILETYPE
+from unstructured.ingest.interfaces import (
+    BaseConnector,
+    BaseConnectorConfig,
+    BaseIngestDoc,
+    ConnectorCleanupMixin,
+    IngestDocCleanupMixin,
+    StandardConnectorConfig,
+)
+from unstructured.ingest.logger import logger
+from unstructured.utils import requires_dependencies
+
+if TYPE_CHECKING:
+    from office365.onedrive.driveitems.driveItem import DriveItem
+
+# NOTE: despite the "MB" in its name this threshold is a byte count; get_file
+# downloads anything larger than it in chunks.
+MAX_MB_SIZE = 512_000_000
+
+
+@dataclass
+class SimpleOneDriveConfig(BaseConnectorConfig):
+    """Settings needed to read a user's OneDrive with an Azure AD application.
+
+    client_id, client_credential and user_pname are mandatory. authority_url and
+    tenant are combined as "{authority_url}/{tenant}" to form the MSAL authority.
+    folder optionally selects the drive folder to start from, and recursive
+    controls whether its sub-folders are listed as well.
+    """
+
+    client_id: str
+    client_credential: str = field(repr=False)
+    user_pname: str
+    tenant: str = field(repr=False)
+    authority_url: str = field(repr=False)
+    folder: str = field(default="")
+    recursive: bool = False
+
+    def __post_init__(self):
+        """Validate the mandatory values and expose the token callback.
+
+        Raises:
+            ValueError: if the client id, client credential or user principal
+                name is missing or empty.
+        """
+        if not (self.client_id and self.client_credential and self.user_pname):
+            raise ValueError(
+                "Please provide all of the following mandatory values:"
+                "\n--ms-client-id\n--ms-client-cred\n--ms-user-pname",
+            )
+        self.token_factory = self._acquire_token
+
+    @requires_dependencies(["msal"])
+    def _acquire_token(self):
+        """Request a Microsoft Graph token for the application itself.
+
+        Returns:
+            The token dictionary produced by MSAL.
+
+        Raises:
+            ValueError: if MSAL rejects the configuration, or if the response
+                does not contain an access token.
+        """
+        from msal import ConfidentialClientApplication
+
+        try:
+            app = ConfidentialClientApplication(
+                authority=f"{self.authority_url}/{self.tenant}",
+                client_id=self.client_id,
+                client_credential=self.client_credential,
+            )
+            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+        except ValueError:
+            logger.error("Couldn't set up credentials for OneDrive")
+            raise
+        # MSAL reports a rejected request through "error" keys in the returned
+        # dictionary instead of raising, so the failure has to be detected here.
+        if "access_token" not in token:
+            raise ValueError(
+                "Couldn't acquire a token for OneDrive: "
+                f"{token.get('error')}: {token.get('error_description')}",
+            )
+        return token
+
+
+@dataclass
+class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
+    """One OneDrive file, with the local paths it is downloaded and written to."""
+
+    config: SimpleOneDriveConfig
+    file: "DriveItem"
+
+    def __post_init__(self):
+        """Check that the file extension is supported and compute the local paths.
+
+        Raises:
+            ValueError: if the file has no extension or an unsupported one.
+        """
+        name = Path(self.file.name)
+        self.ext = "".join(name.suffixes)
+        if self.ext not in EXT_TO_FILETYPE:
+            # A dotted name such as "report.v2.pdf" only matches on its last suffix.
+            self.ext = name.suffix
+        if not self.ext:
+            raise ValueError("Unsupported file without extension.")
+
+        if self.ext not in EXT_TO_FILETYPE:
+            raise ValueError(
+                "Extension not supported. "
+                f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
+            )
+        self._set_download_paths()
+
+    def _set_download_paths(self) -> None:
+        """Parses the folder structure from the source and creates the download and output paths"""
+        download_path = Path(f"{self.standard_config.download_dir}")
+        output_path = Path(f"{self.standard_config.output_dir}")
+
+        # The parent reference may be missing (the "" default) or carry no path, so
+        # read it defensively; only the part after the last ":" is mirrored locally.
+        parent_ref = self.file.get_property("parentReference", "")
+        parent_path = (getattr(parent_ref, "path", None) or "").split(":")[-1]
+        odir = parent_path[1:] if parent_path.startswith("/") else parent_path
+        if odir:
+            download_path = (download_path / odir).resolve()
+            output_path = (output_path / odir).resolve()
+
+        self.download_dir = download_path
+        self.download_filepath = (download_path / self.file.name).resolve()
+        oname = f"{self.file.name[:-len(self.ext)]}.json"
+        self.output_dir = output_path
+        self.output_filepath = (output_path / oname).resolve()
+
+    @property
+    def filename(self):
+        return Path(self.download_filepath).resolve()
+
+    @property
+    def _output_filename(self):
+        return Path(self.output_filepath).resolve()
+
+    @BaseIngestDoc.skip_if_file_exists
+    @requires_dependencies(["office365"])
+    def get_file(self):
+        """Download the file to self.filename, logging instead of raising on failure."""
+        partial = False
+        try:
+            fsize = self.file.get_property("size", 0)
+            self.output_dir.mkdir(parents=True, exist_ok=True)
+
+            if not self.download_dir.is_dir():
+                logger.debug(f"Creating directory: {self.download_dir}")
+                self.download_dir.mkdir(parents=True, exist_ok=True)
+
+            with self.filename.open(mode="wb") as f:
+                partial = True
+                if fsize > MAX_MB_SIZE:
+                    logger.info(f"Downloading file with size: {fsize} bytes in chunks")
+                    self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
+                else:
+                    self.file.download(f).execute_query()
+        except Exception as e:
+            logger.error(f"Error while downloading and saving file: {self.filename}.")
+            logger.error(e)
+            if partial:
+                # Do not leave a truncated file behind: judging by its name, the
+                # skip_if_file_exists decorator could take it for a finished download.
+                try:
+                    self.filename.unlink()
+                except OSError:
+                    logger.debug(f"Could not remove partial download: {self.filename}")
+            return
+        logger.info(f"File downloaded: {self.filename}")
+        return
+
+
+class OneDriveConnector(ConnectorCleanupMixin, BaseConnector):
+    """Lists the files of a OneDrive folder and wraps them in ingest docs."""
+
+    config: SimpleOneDriveConfig
+
+    def __init__(self, standard_config: StandardConnectorConfig, config: SimpleOneDriveConfig):
+        super().__init__(standard_config, config)
+        self._set_client()
+
+    @requires_dependencies(["office365"])
+    def _set_client(self):
+        """Create the Graph client, which gets its tokens from the config's callback."""
+        from office365.graph_client import GraphClient
+
+        self.client = GraphClient(self.config.token_factory)
+
+    def _list_objects(self, folder, recursive) -> List["DriveItem"]:
+        """Return the files in folder, including those of sub-folders when recursive."""
+        drive_items = folder.children.get().execute_query()
+        files = [d for d in drive_items if d.is_file]
+        if not recursive:
+            return files
+        folders = [d for d in drive_items if d.is_folder]
+        for f in folders:
+            files += self._list_objects(f, recursive)
+        return files
+
+    def initialize(self):
+        pass
+
+    def get_ingest_docs(self):
+        """Build an ingest doc for every supported file under the configured folder.
+
+        Files whose extension is missing or unsupported are skipped with a warning
+        so that one such file does not abort the whole run.
+
+        Raises:
+            ValueError: if the configured folder cannot be found.
+        """
+        root = self.client.users[self.config.user_pname].drive.get().execute_query().root
+        if fpath := self.config.folder:
+            root = root.get_by_path(fpath).get().execute_query()
+            if root is None or not root.is_folder:
+                raise ValueError(f"Unable to find directory, given: {fpath}")
+        docs = []
+        for item in self._list_objects(root, self.config.recursive):
+            try:
+                docs.append(OneDriveIngestDoc(self.standard_config, self.config, item))
+            except ValueError as exc:
+                logger.warning(f"Skipping {item.name}: {exc}")
+        return docs
diff --git a/unstructured/ingest/main.py b/unstructured/ingest/main.py
index d6a433de5..01c30c338 100755
--- a/unstructured/ingest/main.py
+++ b/unstructured/ingest/main.py
@@ -391,6 +391,37 @@ class MainProcess:
     default=None,
     help="Number of days to go back in the history of discord channels, must be an number",
 )
+@click.option(
+    "--ms-client-id",
+    default=None,
+    help="Microsoft app client ID",
+)
+@click.option(
+    "--ms-client-cred",
+    default=None,
+    help="Microsoft App client secret",
+)
+@click.option(
+    "--ms-authority-url",
+    default="https://login.microsoftonline.com",
+    help="Authentication token provider for Microsoft apps, default is "
+    "https://login.microsoftonline.com",
+)
+@click.option(
+    "--ms-tenant",
+    default="common",
+    help="ID or domain name associated with your Azure AD instance",
+)
+@click.option(
+    "--ms-user-pname",
+    default=None,
+    help="User principal name, usually is your Azure AD email.",
+)
+@click.option(
+    "--ms-onedrive-folder",
+    default=None,
+    help="Folder to start parsing files from.",
+)
 @click.option(
     "--elasticsearch-url",
     default=None,
@@ -488,6 +519,12 @@ def main(
     discord_channels,
     discord_token,
     discord_period,
+    ms_client_id,
+    ms_client_cred,
+    ms_authority_url,
+    ms_tenant,
+    ms_user_pname,
+    ms_onedrive_folder,
     elasticsearch_url,
     elasticsearch_index_name,
     jq_query,
@@ -590,6 +627,10 @@ def main(
             hashed_dir_name = hashlib.sha256(
                 f"{elasticsearch_url}_{elasticsearch_index_name}".encode("utf-8"),
             )
+        elif ms_user_pname:
+            hashed_dir_name = hashlib.sha256(
+                f"{ms_tenant}_{ms_user_pname}".encode("utf-8"),
+            )
         else:
             raise ValueError(
                 "This connector does not support saving downloads to ~/.cache/ ,"
@@ -815,6 +856,25 @@ def main(
                 decay=biomed_decay,
             ),
         )
+    elif ms_client_id or ms_user_pname:
+        from unstructured.ingest.connector.onedrive import (
+            OneDriveConnector,
+            SimpleOneDriveConfig,
+        )
+
+        doc_connector = OneDriveConnector(  # type: ignore
+            standard_config=standard_config,
+            config=SimpleOneDriveConfig(
+                client_id=ms_client_id,
+                client_credential=ms_client_cred,
+                user_pname=ms_user_pname,
+                tenant=ms_tenant,
+                authority_url=ms_authority_url,
+                folder=ms_onedrive_folder,
+                recursive=recursive,
+            ),
+        )
+
     elif local_input_path:
         from unstructured.ingest.connector.local import (
             LocalConnector,