From f7e46af22f50a4e239ed365f0691321375eaa27c Mon Sep 17 00:00:00 2001 From: David Potter Date: Tue, 25 Jul 2023 21:09:26 -0700 Subject: [PATCH] feat: adds Outlook connector (#939) * bonus: fixes issue with email partitioning where From field was being assigned the To field value. --- .github/workflows/ci.yml | 4 + .../ingest-test-fixtures-update-pr.yml | 4 + CHANGELOG.md | 12 + MANIFEST.in | 2 + Makefile | 4 + docs/source/connectors.rst | 36 +-- example-docs/eml/fake-email.eml | 2 +- .../onedrive/{onedrive.sh => ingest.sh} | 0 examples/ingest/outlook/ingest.sh | 31 +++ requirements/ingest-outlook.in | 5 + requirements/ingest-outlook.txt | 55 +++++ setup.py | 2 + test_unstructured/partition/test_email.py | 6 +- .../outlook/21be155fb0c95885.eml.json | 20 ++ .../outlook/497eba8c81c801c6.eml.json | 20 ++ .../outlook/4a16a411f162ebbb.eml.json | 20 ++ .../test-ingest-onedrive.sh | 4 +- .../test-ingest-outlook.sh | 30 +++ test_unstructured_ingest/test-ingest.sh | 1 + test_unstructured_ingest/unit/test_paths.py | 115 +++++++++ unstructured/__version__.py | 2 +- unstructured/ingest/connector/fsspec.py | 6 +- unstructured/ingest/connector/outlook.py | 230 ++++++++++++++++++ unstructured/ingest/main.py | 36 ++- unstructured/partition/email.py | 2 +- 25 files changed, 623 insertions(+), 26 deletions(-) rename examples/ingest/onedrive/{onedrive.sh => ingest.sh} (100%) create mode 100755 examples/ingest/outlook/ingest.sh create mode 100644 requirements/ingest-outlook.in create mode 100644 requirements/ingest-outlook.txt create mode 100644 test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json create mode 100644 test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json create mode 100644 test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json create mode 100755 test_unstructured_ingest/test-ingest-outlook.sh create mode 100644 test_unstructured_ingest/unit/test_paths.py create mode 100644 
unstructured/ingest/connector/outlook.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 53fad8584..1cda04b60 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -192,7 +192,10 @@ jobs: GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} + MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} + MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} run: | source .venv/bin/activate sudo apt-get update @@ -212,6 +215,7 @@ jobs: make install-ingest-github make install-ingest-gitlab make install-ingest-onedrive + make install-ingest-outlook make install-ingest-slack make install-ingest-wikipedia ./test_unstructured_ingest/test-ingest.sh diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 4072340c6..e80f3963a 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -67,6 +67,9 @@ jobs: UNS_API_KEY: ${{ secrets.UNS_API_KEY }} MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} + MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} + MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} + MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} OVERWRITE_FIXTURES: "true" run: | source .venv/bin/activate @@ -87,6 +90,7 @@ jobs: make install-ingest-github make install-ingest-gitlab make install-ingest-onedrive + make install-ingest-outlook make install-ingest-slack make install-ingest-wikipedia ./test_unstructured_ingest/test-ingest.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 8da12fc17..0f50c7bfb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## 0.8.3 + +### Enhancements + +### Features + +* Adds Outlook connector + +### Fixes + +* Fixes issue with email partitioning where From field was being assigned the 
To field value. + ## 0.8.2-dev7 ### Enhancements diff --git a/MANIFEST.in b/MANIFEST.in index e40374d5b..ecc86fef8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,3 +12,5 @@ include requirements/ingest-reddit.in include requirements/ingest-slack.in include requirements/ingest-wikipedia.in include requirements/ingest-google-drive.in +include requirements/ingest-outlook.in +include requirements/ingest-onedrive.in diff --git a/Makefile b/Makefile index 5d0a8d823..a7cfb209f 100644 --- a/Makefile +++ b/Makefile @@ -90,6 +90,10 @@ install-ingest-gitlab: install-ingest-onedrive: python3 -m pip install -r requirements/ingest-onedrive.txt +.PHONY: install-ingest-outlook +install-ingest-outlook: + python3 -m pip install -r requirements/ingest-outlook.txt + .PHONY: install-ingest-reddit install-ingest-reddit: python3 -m pip install -r requirements/ingest-reddit.txt diff --git a/docs/source/connectors.rst b/docs/source/connectors.rst index f4fe0cbe6..78edee730 100644 --- a/docs/source/connectors.rst +++ b/docs/source/connectors.rst @@ -25,7 +25,7 @@ NOTE: Keep in mind that you will need to have all the appropriate extras and dep -------------------- You can batch process documents stored in your Azure Blob Container using the `Azure Connector `_. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[azure]`` +To install all dependencies for this connector run: ``pip install "unstructured[azure]"`` ``BioMed Connector`` @@ -37,49 +37,49 @@ You can process `National Center for Biotechnology Information `_. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[discord]`` +To install all dependencies for this connector run: ``pip install "unstructured[discord]"`` ``Dropbox Connector`` ---------------------- You can batch process unstructured documents in your Dropbox by using the `Dropbox Connector `_. 
You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[dropbox]`` +To install all dependencies for this connector run: ``pip install "unstructured[dropbox]"`` ``Elasticsearch Connector`` ---------------------------- You can preprocess documents stored in Elasticsearch by using the `Elasticsearch Connector `_. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[elasticsearch]`` +To install all dependencies for this connector run: ``pip install "unstructured[elasticsearch]"`` ``Google Cloud Storage Connector`` ------------------ You can batch load the files you have stored in Google Cloud Storage with the `GCS Connector `_. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[gcs]`` +To install all dependencies for this connector run: ``pip install "unstructured[gcs]"`` ``Github Connector`` --------------------- You can process files in a Github repository using the `Github Connector `_. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[github]`` +To install all dependencies for this connector run: ``pip install "unstructured[github]"`` ``Gitlab Connector`` --------------------- You can batch load files in a Gitlab repository using the `Gitlab Connector `_. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[gitlab]`` +To install all dependencies for this connector run: ``pip install "unstructured[gitlab]"`` ``Google Drive Connector`` --------------------- You can batch process documents stored in your Google Drive with the `Google Drive Connector `_. You can find an example of how to use it `here `_. 
-To install all dependencies for this connector run: ``pip install unstructured[google-drive]`` +To install all dependencies for this connector run: ``pip install "unstructured[google-drive]"`` ``Local Connector`` @@ -89,34 +89,42 @@ You can batch load your unstructured files in a local directory for preprocessin ``OneDrive Connector`` --------------------- -You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector `_. You can find an example of how to use it `here `_. +You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector `_. You can find an example of how to use it `here `_. + +To install all dependencies for this connector run: ``pip install "unstructured[onedrive]"`` + + +``Outlook Connector`` +--------------------- +You can batch process email stored in Microsoft Outlook with the `Outlook Connector `_. You can find an example of how to use it `here `_. + +To install all dependencies for this connector run: ``pip install "unstructured[outlook]"`` -To install all dependencies for this connector run: ``pip install unstructured[onedrive]`` ``Reddit Connector`` --------------------- You can use the `Reddit Connector `_ to preprocess a Reddit thread. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[reddit]`` +To install all dependencies for this connector run: ``pip install "unstructured[reddit]"`` ``S3 Connector`` --------------------- You can process your files stored in S3 in batch using the `S3 Connector `_. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[s3]`` +To install all dependencies for this connector run: ``pip install "unstructured[s3]"`` ``Slack Connector`` --------------------- Using the `Slack Connector `_ you can batch process a channel. You can find an example of how to use it `here `_. 
-To install all dependencies for this connector run: ``pip install unstructured[slack]`` +To install all dependencies for this connector run: ``pip install "unstructured[slack]"`` ``Wikipedia Connector`` --------------------- You can load and process a Wikipedia page using the `Wikipedia Connector `_ to preprocess for your model. You can find an example of how to use it `here `_. -To install all dependencies for this connector run: ``pip install unstructured[wikipedia]`` +To install all dependencies for this connector run: ``pip install "unstructured[wikipedia]"`` diff --git a/example-docs/eml/fake-email.eml b/example-docs/eml/fake-email.eml index 702a40852..17909dd48 100644 --- a/example-docs/eml/fake-email.eml +++ b/example-docs/eml/fake-email.eml @@ -3,7 +3,7 @@ Date: Fri, 16 Dec 2022 17:04:16 -0500 Message-ID: Subject: Test Email From: Matthew Robinson -To: Matthew Robinson +To: NotMatthew Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" --00000000000095c9b205eff92630 diff --git a/examples/ingest/onedrive/onedrive.sh b/examples/ingest/onedrive/ingest.sh similarity index 100% rename from examples/ingest/onedrive/onedrive.sh rename to examples/ingest/onedrive/ingest.sh diff --git a/examples/ingest/outlook/ingest.sh b/examples/ingest/outlook/ingest.sh new file mode 100755 index 000000000..1c3d50363 --- /dev/null +++ b/examples/ingest/outlook/ingest.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Processes Outlook emails through Unstructured's library. Does not download attachments. + +# Structured outputs are stored in outlook-output/ + +# NOTE, this script is not ready-to-run! +# You must enter an Azure AD app client-id, client secret, tenant-id, and email +# before running. 
+ +# To get the credentials for your Azure AD app, follow these steps: +# https://learn.microsoft.com/en-us/graph/auth-register-app-v2 +# https://learn.microsoft.com/en-us/graph/auth-v2-service + +# Assign the necessary permissions for the application to read from mail. +# https://learn.microsoft.com/en-us/graph/permissions-reference + + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "$SCRIPT_DIR"/../../.. || exit 1 + +PYTHONPATH=. ./unstructured/ingest/main.py \ + --ms-client-id "$MS_CLIENT_ID" \ + --ms-client-cred "$MS_CLIENT_CRED" \ + --ms-tenant "$MS_TENANT_ID" \ + --ms-user-email "$MS_USER_EMAIL" \ + --ms-outlook-folders Inbox,"Sent Items" \ + --structured-output-dir outlook-output \ + --num-processes 2 \ + --recursive \ + --verbose diff --git a/requirements/ingest-outlook.in b/requirements/ingest-outlook.in new file mode 100644 index 000000000..c3657aeda --- /dev/null +++ b/requirements/ingest-outlook.in @@ -0,0 +1,5 @@ +-c constraints.in +-c base.txt +msal +Office365-REST-Python-Client +cryptography==41.0.2 diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt new file mode 100644 index 000000000..b3c381d32 --- /dev/null +++ b/requirements/ingest-outlook.txt @@ -0,0 +1,55 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile requirements/ingest-outlook.in +# +certifi==2023.5.7 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +cffi==1.15.1 + # via + # -c requirements/base.txt + # cryptography +charset-normalizer==3.2.0 + # via + # -c requirements/base.txt + # requests +cryptography==41.0.2 + # via + # -c requirements/base.txt + # -r requirements/ingest-outlook.in + # msal + # pyjwt +idna==3.4 + # via + # -c requirements/base.txt + # requests +msal==1.22.0 + # via + # -r requirements/ingest-outlook.in + # office365-rest-python-client +office365-rest-python-client==2.4.2 + # via -r 
requirements/ingest-outlook.in +pycparser==2.21 + # via + # -c requirements/base.txt + # cffi +pyjwt[crypto]==2.7.0 + # via msal +pytz==2023.3 + # via + # -c requirements/base.txt + # office365-rest-python-client +requests==2.31.0 + # via + # -c requirements/base.txt + # msal + # office365-rest-python-client +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests diff --git a/setup.py b/setup.py index b24bae6fd..89bbac2e3 100644 --- a/setup.py +++ b/setup.py @@ -85,6 +85,8 @@ setup( "gcs": load_requirements("requirements/ingest-gcs.in"), "elasticsearch": load_requirements("requirements/ingest-elasticsearch.in"), "dropbox": load_requirements("requirements/ingest-dropbox.in"), + "onedrive": load_requirements("requirements/ingest-onedrive.in"), + "outlook": load_requirements("requirements/ingest-outlook.in"), "confluence": load_requirements("requirements/ingest-confluence.in"), }, package_dir={"unstructured": "unstructured"}, diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 7f3aea511..594029600 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -290,7 +290,7 @@ def test_partition_email_from_file_with_header(): def test_partition_email_from_filename_has_metadata(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml") + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") elements = partition_email(filename=filename) assert len(elements) > 0 assert ( @@ -302,7 +302,7 @@ def test_partition_email_from_filename_has_metadata(): page_number=None, url=None, sent_from=["Matthew Robinson "], - sent_to=["Matthew Robinson "], + sent_to=["NotMatthew "], subject="Test Email", filetype="message/rfc822", ).to_dict() @@ -310,7 +310,7 @@ def test_partition_email_from_filename_has_metadata(): expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00") assert 
elements[0].metadata.get_date() == expected_dt for element in elements: - assert element.metadata.filename == "fake-email-header.eml" + assert element.metadata.filename == "fake-email.eml" def test_extract_email_text_matches_html(): diff --git a/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json new file mode 100644 index 000000000..0f3ce4995 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json @@ -0,0 +1,20 @@ +[ + { + "type": "Title", + "element_id": "a0f48ad299334e5716f85d225bfe2a16", + "metadata": { + "data_source": {}, + "filename": "21be155fb0c95885.eml", + "date": "2023-07-15T08:35:51-07:00", + "filetype": "message/rfc822", + "sent_from": [ + "David Potter " + ], + "sent_to": [ + "devops@unstructuredio.onmicrosoft.com" + ], + "subject": "integration test email 1" + }, + "text": "integration test email" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json new file mode 100644 index 000000000..19f39fdda --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json @@ -0,0 +1,20 @@ +[ + { + "type": "NarrativeText", + "element_id": "cebc4803f41f12981b808ffd79d7b480", + "metadata": { + "data_source": {}, + "filename": "497eba8c81c801c6.eml", + "date": "2023-07-24T18:25:52-07:00", + "filetype": "message/rfc822", + "sent_from": [ + "Ryan Nikolaidis " + ], + "sent_to": [ + "devops@unstructuredio.onmicrosoft.com" + ], + "subject": "subfolder1_1" + }, + "text": "this is a message for the subfolder1_1" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json 
b/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json new file mode 100644 index 000000000..d2e61bccb --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json @@ -0,0 +1,20 @@ +[ + { + "type": "NarrativeText", + "element_id": "007ec3bff83ee17497e490b86a36e0dd", + "metadata": { + "data_source": {}, + "filename": "4a16a411f162ebbb.eml", + "date": "2023-07-09T20:38:47-07:00", + "filetype": "message/rfc822", + "sent_from": [ + "David Potter " + ], + "sent_to": [ + "devops@unstructuredio.onmicrosoft.com" + ], + "subject": "message for subfolder" + }, + "text": "this is a message for the subfolder" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 78747509e..7f991806b 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -17,8 +17,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --download-dir "$DOWNLOAD_DIR" \ --ms-client-cred "$MS_CLIENT_CRED" \ --ms-client-id "$MS_CLIENT_ID" \ - --ms-tenant "3d60a7e5-1e32-414e-839b-1c6e6782613d" \ - --ms-user-pname "devops@unstructuredio.onmicrosoft.com" \ + --ms-tenant "$MS_TENANT_ID" \ + --ms-user-pname "$MS_USER_PNAME" \ --ms-onedrive-folder '/utic-test-ingest-fixtures' \ --metadata-exclude file_directory,metadata.data_source.date_processed \ --num-processes 2 \ diff --git a/test_unstructured_ingest/test-ingest-outlook.sh b/test_unstructured_ingest/test-ingest-outlook.sh new file mode 100755 index 000000000..fa21335ac --- /dev/null +++ b/test_unstructured_ingest/test-ingest-outlook.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -e + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "$SCRIPT_DIR"/.. 
|| exit 1 +OUTPUT_FOLDER_NAME=outlook +OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME + +if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then + echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." + exit 0 +fi + +PYTHONPATH=. ./unstructured/ingest/main.py \ + --download-dir "$DOWNLOAD_DIR" \ + --ms-client-cred "$MS_CLIENT_CRED" \ + --ms-client-id "$MS_CLIENT_ID" \ + --ms-tenant "$MS_TENANT_ID" \ + --ms-user-email "$MS_USER_EMAIL" \ + --ms-outlook-folders IntegrationTest \ + --metadata-exclude file_directory,metadata.data_source.date_processed \ + --num-processes 2 \ + --preserve-downloads \ + --recursive \ + --reprocess \ + --structured-output-dir "$OUTPUT_DIR" + +sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 2d6b5acd4..b202ce455 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -22,6 +22,7 @@ export OMP_THREAD_LIMIT=1 ./test_unstructured_ingest/test-ingest-against-api.sh ./test_unstructured_ingest/test-ingest-gcs.sh ./test_unstructured_ingest/test-ingest-onedrive.sh +./test_unstructured_ingest/test-ingest-outlook.sh ./test_unstructured_ingest/test-ingest-elasticsearch.sh ./test_unstructured_ingest/test-ingest-confluence-diff.sh ./test_unstructured_ingest/test-ingest-confluence-large.sh diff --git a/test_unstructured_ingest/unit/test_paths.py b/test_unstructured_ingest/unit/test_paths.py new file mode 100644 index 000000000..52c6823d6 --- /dev/null +++ b/test_unstructured_ingest/unit/test_paths.py @@ -0,0 +1,115 @@ +from dataclasses import dataclass +from pathlib import Path + +from unstructured.ingest.connector.dropbox import ( + DropboxIngestDoc, +) +from 
unstructured.ingest.connector.fsspec import ( + FsspecIngestDoc, +) +from unstructured.ingest.interfaces import ( + BaseConnectorConfig, + BaseIngestDoc, + StandardConnectorConfig, +) + + +@dataclass +class FakeConfigDropboxRoot: + output_dir = "/fakeuser/fake_output" + dir_path = " " + download_dir = "/fakeuser/fake_download" + + +@dataclass +class FakeConfigFolder: + output_dir = "/fakeuser/fake_output" + dir_path = "fake_folder" + download_dir = "/fakeuser/fake_download" + + +def test_dropbox_root_succeeds(): + """Test that path joining method works for Dropbox root folder. Note slash in front of remote_file_path.""" + dbox = DropboxIngestDoc( + config=FakeConfigDropboxRoot, + standard_config=FakeConfigDropboxRoot, + remote_file_path="/fake_file.txt", + ) + output_filename = dbox._output_filename + download_filename = dbox._tmp_download_file() + + assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json") + assert download_filename == Path("/fakeuser/fake_download/fake_file.txt") + + +def test_dropbox_root_succeeds2(): + """Test that path joining method works for Dropbox root folder. Note lack of slash in front of remote_file_path. + This still works.""" + dbox = DropboxIngestDoc( + config=FakeConfigDropboxRoot, + standard_config=FakeConfigDropboxRoot, + remote_file_path="fake_file.txt", + ) + output_filename = dbox._output_filename + download_filename = dbox._tmp_download_file() + + assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json") + assert download_filename == Path("/fakeuser/fake_download/fake_file.txt") + + +def test_dropbox_folder_succeeds(): + """Test that path joining method works for Dropbox root folder. 
Note no slash in front of remote_file_path.""" + dbox = DropboxIngestDoc( + config=FakeConfigFolder, + standard_config=FakeConfigFolder, + remote_file_path="fake_file2.txt", + ) + output_filename = dbox._output_filename + download_filename = dbox._tmp_download_file() + + assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json") + assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt") + + +def test_dropbox_folder_fails(): + """Test that path joining method gives WRONG path. Note slash in front of remote_file_path. + Path joining is sensitive. Note that the path is MISSING the folders.""" + dbox = DropboxIngestDoc( + config=FakeConfigFolder, + standard_config=FakeConfigFolder, + remote_file_path="/fake_file2.txt", + ) + output_filename = dbox._output_filename + download_filename = dbox._tmp_download_file() + + assert output_filename == Path("/fake_file2.txt.json") + assert download_filename == Path("/fake_file2.txt") + + +def test_fsspec_folder_succeeds(): + """Test that path joining method works for root folder. Note no slash in front of remote_file_path.""" + dbox = FsspecIngestDoc( + config=FakeConfigFolder, + standard_config=FakeConfigFolder, + remote_file_path="fake_file2.txt", + ) + output_filename = dbox._output_filename + download_filename = dbox._tmp_download_file() + + assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json") + assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt") + + +def test_fsspec_folder_fails(): + """Test that path joining method gives WRONG path. Note slash in front of remote_file_path. + Path joining is sensitive. 
Note that the path is MISSING the folders.""" + fstest = FsspecIngestDoc( + config=FakeConfigFolder, + standard_config=FakeConfigFolder, + remote_file_path="/fake_file2.txt", + ) + output_filename = fstest._output_filename + download_filename = fstest._tmp_download_file() + + assert output_filename == Path("/fake_file2.txt.json") + assert download_filename == Path("/fake_file2.txt") \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 079a22828..54090120e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.2-dev7" # pragma: no cover +__version__ = "0.8.3" # pragma: no cover diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py index 3fb258685..ad61ef115 100644 --- a/unstructured/ingest/connector/fsspec.py +++ b/unstructured/ingest/connector/fsspec.py @@ -166,9 +166,9 @@ class FsspecConnector(ConnectorCleanupMixin, BaseConnector): def get_ingest_docs(self): return [ self.ingest_doc_cls( - self.standard_config, - self.config, - file, + standard_config=self.standard_config, + config=self.config, + remote_file_path=file, ) for file in self._list_files() ] diff --git a/unstructured/ingest/connector/outlook.py b/unstructured/ingest/connector/outlook.py new file mode 100644 index 000000000..c06dbca8b --- /dev/null +++ b/unstructured/ingest/connector/outlook.py @@ -0,0 +1,230 @@ +import hashlib +import os +from collections import defaultdict +from dataclasses import dataclass, field +from itertools import chain +from pathlib import Path +from typing import List + +from office365.onedrive.driveitems.driveItem import DriveItem + +from unstructured.ingest.interfaces import ( + BaseConnector, + BaseConnectorConfig, + BaseIngestDoc, + ConnectorCleanupMixin, + IngestDocCleanupMixin, + StandardConnectorConfig, +) +from unstructured.ingest.logger import logger +from unstructured.utils import requires_dependencies + +MAX_NUM_EMAILS = 
1000000 # Maximum number of emails per folder + + +class MissingFolderError(Exception): + """There are no root folders with those names.""" + + +@dataclass +class SimpleOutlookConfig(BaseConnectorConfig): + """This class is getting the token.""" + + client_id: str + client_credential: str = field(repr=False) + user_email: str + tenant: str = field(repr=False) + authority_url: str = field(repr=False) + ms_outlook_folders: List[str] + recursive: bool = False + + def __post_init__(self): + if not (self.client_id and self.client_credential and self.user_email): + raise ValueError( + "Please provide one of the following mandatory values:" + "\n--ms-client_id\n--ms-client_cred\n--ms-user-email", + ) + self.token_factory = self._acquire_token + + @requires_dependencies(["msal"]) + def _acquire_token(self): + from msal import ConfidentialClientApplication + + try: + app = ConfidentialClientApplication( + authority=f"{self.authority_url}/{self.tenant}", + client_id=self.client_id, + client_credential=self.client_credential, + ) + token = app.acquire_token_for_client( + scopes=["https://graph.microsoft.com/.default"], + ) + except ValueError as exc: + logger.error("Couldn't set up credentials for Outlook") + raise exc + return token + + @staticmethod + def parse_folders(folder_str: str) -> List[str]: + """Parses a comma separated string of Outlook folders into a list.""" + return [x.strip() for x in folder_str.split(",")] + + +@dataclass +class OutlookIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): + config: SimpleOutlookConfig + file: DriveItem + + def __post_init__(self): + self._set_download_paths() + + def hash_mail_name(self, id): + """Outlook email ids are 152 char long. 
Hash to shorten to 16.""" + return hashlib.sha256(id.encode("utf-8")).hexdigest()[:16] + + def _set_download_paths(self) -> None: + """Creates paths for downloading and parsing.""" + download_path = Path(f"{self.standard_config.download_dir}") + output_path = Path(f"{self.standard_config.output_dir}") + + self.download_dir = download_path + self.download_filepath = ( + download_path / f"{self.hash_mail_name(self.file.id)}.eml" + ).resolve() + oname = f"{self.hash_mail_name(self.file.id)}.eml.json" + self.output_dir = output_path + self.output_filepath = (output_path / oname).resolve() + + @property + def filename(self): + return Path(self.download_filepath).resolve() + + @property + def _output_filename(self): + return Path(self.output_filepath).resolve() + + @BaseIngestDoc.skip_if_file_exists + @requires_dependencies(["office365"]) + def get_file(self): + """Relies on Office365 python sdk message object to do the download.""" + try: + if not self.download_dir.is_dir(): + logger.debug(f"Creating directory: {self.download_dir}") + self.download_dir.mkdir(parents=True, exist_ok=True) + + with open( + os.path.join( + self.download_dir, + self.hash_mail_name(self.file.id) + ".eml", + ), + "wb", + ) as local_file: + self.file.download( + local_file, + ).execute_query() # download MIME representation of a message + + except Exception as e: + logger.error( + f"Error while downloading and saving file: {self.file.subject}.", + ) + logger.error(e) + return + logger.info(f"File downloaded: {self.file.subject}") + return + + +class OutlookConnector(ConnectorCleanupMixin, BaseConnector): + config: SimpleOutlookConfig + + def __init__( + self, + standard_config: StandardConnectorConfig, + config: SimpleOutlookConfig, + ): + super().__init__(standard_config, config) + self._set_client() + self.get_folder_ids() + + @requires_dependencies(["office365"]) + def _set_client(self): + from office365.graph_client import GraphClient + + self.client = GraphClient(self.config.token_factory) 
+ + def initialize(self): + pass + + def recurse_folders(self, folder_id, main_folder_dict): + """We only get a count of subfolders for any folder. + Have to make additional calls to get subfolder ids.""" + subfolders = ( + self.client.users[self.config.user_email] + .mail_folders[folder_id] + .child_folders.get() + .execute_query() + ) + for subfolder in subfolders: + for k, v in main_folder_dict.items(): + if subfolder.get_property("parentFolderId") in v: + v.append(subfolder.id) + if subfolder.get_property("childFolderCount") > 0: + self.recurse_folders(subfolder.id, main_folder_dict) + + def get_folder_ids(self): + """Sets the mail folder ids and subfolder ids for requested root mail folders.""" + self.root_folders = defaultdict(list) + root_folders_with_subfolders = [] + get_root_folders = ( + self.client.users[self.config.user_email].mail_folders.get().execute_query() + ) + + for folder in get_root_folders: + self.root_folders[folder.display_name].append(folder.id) + if folder.get_property("childFolderCount") > 0: + root_folders_with_subfolders.append(folder.id) + + for folder in root_folders_with_subfolders: + self.recurse_folders(folder, self.root_folders) + + # Narrow down all mail folder ids (plus all subfolders) to the ones that were requested. + self.selected_folder_ids = list( + chain.from_iterable( + [ + v + for k, v in self.root_folders.items() + if k.lower() in [x.lower() for x in self.config.ms_outlook_folders] + ], + ), + ) + if not self.selected_folder_ids: + raise MissingFolderError( + f"There are no root folders with the names: {self.config.ms_outlook_folders}", + ) + + def get_ingest_docs(self): + """Returns a list of all the message objects that are in the requested root folder(s).""" + filtered_messages = [] + + # Get all the relevant messages in the selected folders/subfolders. 
+ for folder_id in self.selected_folder_ids: + messages = ( + self.client.users[self.config.user_email] + .mail_folders[folder_id] + .messages.get() + .top(MAX_NUM_EMAILS) # Prevents the return from paging + .execute_query() + ) + # Skip empty list if there are no messages in folder. + if messages: + filtered_messages.append(messages) + + # Filtered messages have an un-downloadable resource path. + # So we get each message object individually. + individual_messages = [] + for m in list(chain.from_iterable(filtered_messages)): + messages = ( + self.client.users[self.config.user_email].messages[m.id].get().execute_query() + ) + individual_messages.append(messages) + + return [OutlookIngestDoc(self.standard_config, self.config, f) for f in individual_messages] diff --git a/unstructured/ingest/main.py b/unstructured/ingest/main.py index 4e1adf0c3..a023a6124 100755 --- a/unstructured/ingest/main.py +++ b/unstructured/ingest/main.py @@ -427,6 +427,17 @@ class MainProcess: default=None, help="Folder to start parsing files from.", ) +@click.option( + "--ms-user-email", + default=None, + help="Outlook email to download messages from.", +) +@click.option( + "--ms-outlook-folders", + default=None, + help="Comma separated list of folders to download email messages from. " + "Do not specify subfolders. 
Use quotes if spaces in folder names.", +) @click.option( "--elasticsearch-url", default=None, @@ -568,6 +579,8 @@ def main( ms_tenant, ms_user_pname, ms_onedrive_folder, + ms_user_email, + ms_outlook_folders, elasticsearch_url, elasticsearch_index_name, jq_query, @@ -681,6 +694,8 @@ def main( hashed_dir_name = hashlib.sha256( f"{ms_tenant}_{ms_user_pname}".encode("utf-8"), ) + elif ms_user_email: + hashed_dir_name = hashlib.sha256(ms_user_email.encode("utf-8")) elif confluence_url: hashed_dir_name = hashlib.sha256( f"{confluence_url}".encode("utf-8"), @@ -910,7 +925,7 @@ def main( decay=biomed_decay, ), ) - elif ms_client_id or ms_user_pname: + elif ms_client_id and ms_user_pname: from unstructured.ingest.connector.onedrive import ( OneDriveConnector, SimpleOneDriveConfig, @@ -929,6 +944,25 @@ def main( ), ) + elif ms_client_id and ms_user_email: + from unstructured.ingest.connector.outlook import ( + OutlookConnector, + SimpleOutlookConfig, + ) + + doc_connector = OutlookConnector( # type: ignore + standard_config=standard_config, + config=SimpleOutlookConfig( + client_id=ms_client_id, + client_credential=ms_client_cred, + user_email=ms_user_email, + tenant=ms_tenant, + authority_url=ms_authority_url, + ms_outlook_folders=SimpleOutlookConfig.parse_folders(ms_outlook_folders), + recursive=recursive, + ), + ) + elif local_input_path: from unstructured.ingest.connector.local import ( LocalConnector, diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 015f1a881..3d97ddae3 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -111,7 +111,7 @@ def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetada if email_date is not None: email_date = convert_to_iso_8601(email_date) - sent_from = header_dict.get("To") + sent_from = header_dict.get("From") if sent_from is not None: sent_from = [sender.strip() for sender in sent_from.split(",")]