mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 11:03:38 +00:00
feat: adds Outlook connector (#939)
* bonus: fixes issue with email partitioning where From field was being assigned the To field value.
This commit is contained in:
parent
d694cd53bf
commit
f7e46af22f
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@ -192,7 +192,10 @@ jobs:
|
||||
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
|
||||
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
|
||||
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
|
||||
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
|
||||
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
sudo apt-get update
|
||||
@ -212,6 +215,7 @@ jobs:
|
||||
make install-ingest-github
|
||||
make install-ingest-gitlab
|
||||
make install-ingest-onedrive
|
||||
make install-ingest-outlook
|
||||
make install-ingest-slack
|
||||
make install-ingest-wikipedia
|
||||
./test_unstructured_ingest/test-ingest.sh
|
||||
|
||||
@ -67,6 +67,9 @@ jobs:
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
|
||||
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
|
||||
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
|
||||
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
|
||||
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
|
||||
OVERWRITE_FIXTURES: "true"
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
@ -87,6 +90,7 @@ jobs:
|
||||
make install-ingest-github
|
||||
make install-ingest-gitlab
|
||||
make install-ingest-onedrive
|
||||
make install-ingest-outlook
|
||||
make install-ingest-slack
|
||||
make install-ingest-wikipedia
|
||||
./test_unstructured_ingest/test-ingest.sh
|
||||
|
||||
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,3 +1,15 @@
|
||||
## 0.8.3
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
* Adds Outlook connector
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fixes issue with email partitioning where From field was being assigned the To field value.
|
||||
|
||||
## 0.8.2-dev7
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -12,3 +12,5 @@ include requirements/ingest-reddit.in
|
||||
include requirements/ingest-slack.in
|
||||
include requirements/ingest-wikipedia.in
|
||||
include requirements/ingest-google-drive.in
|
||||
include requirements/ingest-outlook.in
|
||||
include requirements/ingest-onedrive.in
|
||||
|
||||
4
Makefile
4
Makefile
@ -90,6 +90,10 @@ install-ingest-gitlab:
|
||||
install-ingest-onedrive:
|
||||
python3 -m pip install -r requirements/ingest-onedrive.txt
|
||||
|
||||
.PHONY: install-ingest-outlook
|
||||
install-ingest-outlook:
|
||||
python3 -m pip install -r requirements/ingest-outlook.txt
|
||||
|
||||
.PHONY: install-ingest-reddit
|
||||
install-ingest-reddit:
|
||||
python3 -m pip install -r requirements/ingest-reddit.txt
|
||||
|
||||
@ -25,7 +25,7 @@ NOTE: Keep in mind that you will need to have all the appropriate extras and dep
|
||||
--------------------
|
||||
You can batch process documents stored in your Azure Blob Container using the `Azure Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/azure.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/azure/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[azure]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[azure]"``
|
||||
|
||||
|
||||
``BioMed Connector``
|
||||
@ -37,49 +37,49 @@ You can process `National Center for Biotechnology Information <https://www.ncbi
|
||||
----------------------
|
||||
You can preprocess your Discord channel using the `Discord Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/discord.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/discord/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[discord]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[discord]"``
|
||||
|
||||
|
||||
``Dropbox Connector``
|
||||
----------------------
|
||||
You can batch process unstructured documents in your Dropbox by using the `Dropbox Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/dropbox.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/dropbox/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[dropbox]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[dropbox]"``
|
||||
|
||||
|
||||
``Elasticsearch Connector``
|
||||
----------------------------
|
||||
You can preprocess documents stored in Elasticsearch by using the `Elasticsearch Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/elasticsearch.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/elasticsearch/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[elasticsearch]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[elasticsearch]"``
|
||||
|
||||
|
||||
``Google Cloud Storage Connector``
|
||||
----------------------------------
|
||||
You can batch load the files you have stored in Google Cloud Storage with the `GCS Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gcs.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_cloud_storage/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[gcs]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[gcs]"``
|
||||
|
||||
|
||||
``Github Connector``
|
||||
---------------------
|
||||
You can process files in a Github repository using the `Github Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/github.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/github/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[github]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[github]"``
|
||||
|
||||
|
||||
``Gitlab Connector``
|
||||
---------------------
|
||||
You can batch load files in a Gitlab repository using the `Gitlab Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gitlab.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/gitlab/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[gitlab]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[gitlab]"``
|
||||
|
||||
|
||||
``Google Drive Connector``
|
||||
--------------------------
|
||||
You can batch process documents stored in your Google Drive with the `Google Drive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/google_drive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_drive/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[google-drive]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[google-drive]"``
|
||||
|
||||
|
||||
``Local Connector``
|
||||
@ -89,34 +89,42 @@ You can batch load your unstructured files in a local directory for preprocessin
|
||||
|
||||
``OneDrive Connector``
|
||||
----------------------
|
||||
You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/onedrive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/onedrive/onedrive.sh>`_.
|
||||
You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/onedrive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/onedrive/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[onedrive]"``
|
||||
|
||||
|
||||
``Outlook Connector``
|
||||
---------------------
|
||||
You can batch process email stored in Microsoft Outlook with the `Outlook Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/outlook.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/outlook/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[outlook]"``
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[onedrive]``
|
||||
|
||||
|
||||
``Reddit Connector``
|
||||
---------------------
|
||||
You can use the `Reddit Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/reddit.py>`_ to preprocess a Reddit thread. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/reddit/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[reddit]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[reddit]"``
|
||||
|
||||
|
||||
``S3 Connector``
|
||||
---------------------
|
||||
You can process your files stored in S3 in batch using the `S3 Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/s3.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/s3-small-batch/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[s3]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[s3]"``
|
||||
|
||||
|
||||
``Slack Connector``
|
||||
---------------------
|
||||
Using the `Slack Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/slack.py>`_ you can batch process a channel. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/slack/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[slack]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[slack]"``
|
||||
|
||||
|
||||
``Wikipedia Connector``
|
||||
-----------------------
|
||||
You can load and process a Wikipedia page using the `Wikipedia Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/wikipedia.py>`_ to preprocess for your model. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/wikipedia/ingest.sh>`_.
|
||||
|
||||
To install all dependencies for this connector run: ``pip install unstructured[wikipedia]``
|
||||
To install all dependencies for this connector run: ``pip install "unstructured[wikipedia]"``
|
||||
|
||||
@ -3,7 +3,7 @@ Date: Fri, 16 Dec 2022 17:04:16 -0500
|
||||
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
|
||||
Subject: Test Email
|
||||
From: Matthew Robinson <mrobinson@unstructured.io>
|
||||
To: Matthew Robinson <mrobinson@unstructured.io>
|
||||
To: NotMatthew <NotMatthew@notunstructured.com>
|
||||
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
|
||||
|
||||
--00000000000095c9b205eff92630
|
||||
|
||||
31
examples/ingest/outlook/ingest.sh
Executable file
31
examples/ingest/outlook/ingest.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Processes Outlook emails through Unstructured's library. Does not download attachments.
|
||||
|
||||
# Structured outputs are stored in outlook-output/
|
||||
|
||||
# NOTE, this script is not ready-to-run!
|
||||
# You must enter an Azure AD app client-id, client secret, tenant-id, and email
|
||||
# before running.
|
||||
|
||||
# To get the credentials for your Azure AD app, follow these steps:
|
||||
# https://learn.microsoft.com/en-us/graph/auth-register-app-v2
|
||||
# https://learn.microsoft.com/en-us/graph/auth-v2-service
|
||||
|
||||
# Assign the necessary permissions for the application to read from mail.
|
||||
# https://learn.microsoft.com/en-us/graph/permissions-reference
|
||||
|
||||
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/../../.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--ms-client-id "$MS_CLIENT_ID" \
|
||||
--ms-client-cred "$MS_CLIENT_CRED" \
|
||||
--ms-tenant "$MS_TENANT_ID" \
|
||||
--ms-user-email "$MS_USER_EMAIL" \
|
||||
--ms-outlook-folders Inbox,"Sent Items" \
|
||||
--structured-output-dir outlook-output \
|
||||
--num-processes 2 \
|
||||
--recursive \
|
||||
--verbose
|
||||
5
requirements/ingest-outlook.in
Normal file
5
requirements/ingest-outlook.in
Normal file
@ -0,0 +1,5 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
msal
|
||||
Office365-REST-Python-Client
|
||||
cryptography==41.0.2
|
||||
55
requirements/ingest-outlook.txt
Normal file
55
requirements/ingest-outlook.txt
Normal file
@ -0,0 +1,55 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-outlook.in
|
||||
#
|
||||
certifi==2023.5.7
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cryptography
|
||||
charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -r requirements/ingest-outlook.in
|
||||
# msal
|
||||
# pyjwt
|
||||
idna==3.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
msal==1.22.0
|
||||
# via
|
||||
# -r requirements/ingest-outlook.in
|
||||
# office365-rest-python-client
|
||||
office365-rest-python-client==2.4.2
|
||||
# via -r requirements/ingest-outlook.in
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
pyjwt[crypto]==2.7.0
|
||||
# via msal
|
||||
pytz==2023.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# office365-rest-python-client
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# msal
|
||||
# office365-rest-python-client
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
2
setup.py
2
setup.py
@ -85,6 +85,8 @@ setup(
|
||||
"gcs": load_requirements("requirements/ingest-gcs.in"),
|
||||
"elasticsearch": load_requirements("requirements/ingest-elasticsearch.in"),
|
||||
"dropbox": load_requirements("requirements/ingest-dropbox.in"),
|
||||
"onedrive": load_requirements("requirements/ingest-onedrive.in"),
|
||||
"outlook": load_requirements("requirements/ingest-outlook.in"),
|
||||
"confluence": load_requirements("requirements/ingest-confluence.in"),
|
||||
},
|
||||
package_dir={"unstructured": "unstructured"},
|
||||
|
||||
@ -290,7 +290,7 @@ def test_partition_email_from_file_with_header():
|
||||
|
||||
|
||||
def test_partition_email_from_filename_has_metadata():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
|
||||
elements = partition_email(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert (
|
||||
@ -302,7 +302,7 @@ def test_partition_email_from_filename_has_metadata():
|
||||
page_number=None,
|
||||
url=None,
|
||||
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
sent_to=["NotMatthew <NotMatthew@notunstructured.com>"],
|
||||
subject="Test Email",
|
||||
filetype="message/rfc822",
|
||||
).to_dict()
|
||||
@ -310,7 +310,7 @@ def test_partition_email_from_filename_has_metadata():
|
||||
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
|
||||
assert elements[0].metadata.get_date() == expected_dt
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "fake-email-header.eml"
|
||||
assert element.metadata.filename == "fake-email.eml"
|
||||
|
||||
|
||||
def test_extract_email_text_matches_html():
|
||||
|
||||
@ -0,0 +1,20 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "a0f48ad299334e5716f85d225bfe2a16",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "21be155fb0c95885.eml",
|
||||
"date": "2023-07-15T08:35:51-07:00",
|
||||
"filetype": "message/rfc822",
|
||||
"sent_from": [
|
||||
"David Potter <potterdavidm@gmail.com>"
|
||||
],
|
||||
"sent_to": [
|
||||
"devops@unstructuredio.onmicrosoft.com"
|
||||
],
|
||||
"subject": "integration test email 1"
|
||||
},
|
||||
"text": "integration test email"
|
||||
}
|
||||
]
|
||||
@ -0,0 +1,20 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "cebc4803f41f12981b808ffd79d7b480",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "497eba8c81c801c6.eml",
|
||||
"date": "2023-07-24T18:25:52-07:00",
|
||||
"filetype": "message/rfc822",
|
||||
"sent_from": [
|
||||
"Ryan Nikolaidis <ryan@unstructured.io>"
|
||||
],
|
||||
"sent_to": [
|
||||
"devops@unstructuredio.onmicrosoft.com"
|
||||
],
|
||||
"subject": "subfolder1_1"
|
||||
},
|
||||
"text": "this is a message for the subfolder1_1"
|
||||
}
|
||||
]
|
||||
@ -0,0 +1,20 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "007ec3bff83ee17497e490b86a36e0dd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "4a16a411f162ebbb.eml",
|
||||
"date": "2023-07-09T20:38:47-07:00",
|
||||
"filetype": "message/rfc822",
|
||||
"sent_from": [
|
||||
"David Potter <potterdavidm@gmail.com>"
|
||||
],
|
||||
"sent_to": [
|
||||
"devops@unstructuredio.onmicrosoft.com"
|
||||
],
|
||||
"subject": "message for subfolder"
|
||||
},
|
||||
"text": "this is a message for the subfolder"
|
||||
}
|
||||
]
|
||||
@ -17,8 +17,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
--ms-client-cred "$MS_CLIENT_CRED" \
|
||||
--ms-client-id "$MS_CLIENT_ID" \
|
||||
--ms-tenant "3d60a7e5-1e32-414e-839b-1c6e6782613d" \
|
||||
--ms-user-pname "devops@unstructuredio.onmicrosoft.com" \
|
||||
--ms-tenant "$MS_TENANT_ID" \
|
||||
--ms-user-pname "$MS_USER_PNAME" \
|
||||
--ms-onedrive-folder '/utic-test-ingest-fixtures' \
|
||||
--metadata-exclude file_directory,metadata.data_source.date_processed \
|
||||
--num-processes 2 \
|
||||
|
||||
30
test_unstructured_ingest/test-ingest-outlook.sh
Executable file
30
test_unstructured_ingest/test-ingest-outlook.sh
Executable file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=outlook
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
|
||||
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
--ms-client-cred "$MS_CLIENT_CRED" \
|
||||
--ms-client-id "$MS_CLIENT_ID" \
|
||||
--ms-tenant "$MS_TENANT_ID" \
|
||||
--ms-user-email "$MS_USER_EMAIL" \
|
||||
--ms-outlook-folders IntegrationTest \
|
||||
--metadata-exclude file_directory,metadata.data_source.date_processed \
|
||||
--num-processes 2 \
|
||||
--preserve-downloads \
|
||||
--recursive \
|
||||
--reprocess \
|
||||
--structured-output-dir "$OUTPUT_DIR"
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
@ -22,6 +22,7 @@ export OMP_THREAD_LIMIT=1
|
||||
./test_unstructured_ingest/test-ingest-against-api.sh
|
||||
./test_unstructured_ingest/test-ingest-gcs.sh
|
||||
./test_unstructured_ingest/test-ingest-onedrive.sh
|
||||
./test_unstructured_ingest/test-ingest-outlook.sh
|
||||
./test_unstructured_ingest/test-ingest-elasticsearch.sh
|
||||
./test_unstructured_ingest/test-ingest-confluence-diff.sh
|
||||
./test_unstructured_ingest/test-ingest-confluence-large.sh
|
||||
|
||||
115
test_unstructured_ingest/unit/test_paths.py
Normal file
115
test_unstructured_ingest/unit/test_paths.py
Normal file
@ -0,0 +1,115 @@
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from unstructured.ingest.connector.dropbox import (
|
||||
DropboxIngestDoc,
|
||||
)
|
||||
from unstructured.ingest.connector.fsspec import (
|
||||
FsspecIngestDoc,
|
||||
)
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
StandardConnectorConfig,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class FakeConfigDropboxRoot:
    """Stand-in connector config whose remote directory is the Dropbox root.

    None of the attributes is annotated, so ``@dataclass`` generates no
    fields; they are plain class attributes and can be read straight off the
    class object without instantiating it.
    """

    # Local directories the ingest doc is expected to join file names onto.
    download_dir = "/fakeuser/fake_download"
    output_dir = "/fakeuser/fake_output"
    # A single space is the value used here to represent the root folder.
    dir_path = " "
|
||||
|
||||
|
||||
@dataclass
class FakeConfigFolder:
    """Stand-in connector config whose remote directory is a named folder.

    None of the attributes is annotated, so ``@dataclass`` generates no
    fields; they are plain class attributes and can be read straight off the
    class object without instantiating it.
    """

    # Local directories the ingest doc is expected to join file names onto.
    download_dir = "/fakeuser/fake_download"
    output_dir = "/fakeuser/fake_output"
    # Remote folder name (no leading slash).
    dir_path = "fake_folder"
|
||||
|
||||
|
||||
def test_dropbox_root_succeeds():
    """Path joining works for the Dropbox root folder.

    Here ``remote_file_path`` starts with a slash.
    """
    doc = DropboxIngestDoc(
        standard_config=FakeConfigDropboxRoot,
        config=FakeConfigDropboxRoot,
        remote_file_path="/fake_file.txt",
    )

    assert doc._output_filename == Path("/fakeuser/fake_output/fake_file.txt.json")
    assert doc._tmp_download_file() == Path("/fakeuser/fake_download/fake_file.txt")
|
||||
|
||||
|
||||
def test_dropbox_root_succeeds2():
    """Path joining works for the Dropbox root folder.

    Here ``remote_file_path`` has no leading slash; the result is the same as
    with one.
    """
    doc = DropboxIngestDoc(
        standard_config=FakeConfigDropboxRoot,
        config=FakeConfigDropboxRoot,
        remote_file_path="fake_file.txt",
    )

    assert doc._output_filename == Path("/fakeuser/fake_output/fake_file.txt.json")
    assert doc._tmp_download_file() == Path("/fakeuser/fake_download/fake_file.txt")
|
||||
|
||||
|
||||
def test_dropbox_folder_succeeds():
    """Path joining works when the Dropbox config points at a folder.

    Here ``remote_file_path`` has no leading slash.
    """
    doc = DropboxIngestDoc(
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
        remote_file_path="fake_file2.txt",
    )

    assert doc._output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json")
    assert doc._tmp_download_file() == Path("/fakeuser/fake_download/fake_file2.txt")
|
||||
|
||||
|
||||
def test_dropbox_folder_fails():
    """Pin the WRONG paths produced when ``remote_file_path`` has a leading slash.

    Path joining is sensitive to that slash: the expected values below are
    MISSING the configured output and download folders.
    """
    doc = DropboxIngestDoc(
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
        remote_file_path="/fake_file2.txt",
    )

    assert doc._output_filename == Path("/fake_file2.txt.json")
    assert doc._tmp_download_file() == Path("/fake_file2.txt")
|
||||
|
||||
|
||||
def test_fsspec_folder_succeeds():
    """Path joining works for an fsspec doc configured with a folder.

    Here ``remote_file_path`` has no leading slash.
    """
    fsspec_doc = FsspecIngestDoc(
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
        remote_file_path="fake_file2.txt",
    )

    assert fsspec_doc._output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json")
    assert fsspec_doc._tmp_download_file() == Path("/fakeuser/fake_download/fake_file2.txt")
|
||||
|
||||
|
||||
def test_fsspec_folder_fails():
    """Pin the WRONG paths produced when ``remote_file_path`` has a leading slash.

    Path joining is sensitive to that slash: the expected values below are
    MISSING the configured output and download folders.
    """
    fsspec_doc = FsspecIngestDoc(
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
        remote_file_path="/fake_file2.txt",
    )

    assert fsspec_doc._output_filename == Path("/fake_file2.txt.json")
    assert fsspec_doc._tmp_download_file() == Path("/fake_file2.txt")
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.8.2-dev7" # pragma: no cover
|
||||
__version__ = "0.8.3" # pragma: no cover
|
||||
|
||||
@ -166,9 +166,9 @@ class FsspecConnector(ConnectorCleanupMixin, BaseConnector):
|
||||
def get_ingest_docs(self):
|
||||
return [
|
||||
self.ingest_doc_cls(
|
||||
self.standard_config,
|
||||
self.config,
|
||||
file,
|
||||
standard_config=self.standard_config,
|
||||
config=self.config,
|
||||
remote_file_path=file,
|
||||
)
|
||||
for file in self._list_files()
|
||||
]
|
||||
|
||||
230
unstructured/ingest/connector/outlook.py
Normal file
230
unstructured/ingest/connector/outlook.py
Normal file
@ -0,0 +1,230 @@
|
||||
import hashlib
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from office365.onedrive.driveitems.driveItem import DriveItem
|
||||
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnector,
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
ConnectorCleanupMixin,
|
||||
IngestDocCleanupMixin,
|
||||
StandardConnectorConfig,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
MAX_NUM_EMAILS = 1000000 # Maximum number of emails per folder
|
||||
|
||||
|
||||
class MissingFolderError(Exception):
    """Signals that no root mail folders exist with the requested names."""
|
||||
|
||||
|
||||
@dataclass
class SimpleOutlookConfig(BaseConnectorConfig):
    """Connection settings for the Outlook connector.

    Besides holding the settings, the config exposes ``token_factory``, a
    callable that requests a Microsoft Graph token via MSAL using the client
    id/credential and tenant stored here.
    """

    client_id: str
    # Secret; kept out of the dataclass repr.
    client_credential: str = field(repr=False)
    # Mailbox whose folders are read.
    user_email: str
    tenant: str = field(repr=False)
    # Base authority URL; the tenant is appended to it when requesting a token.
    authority_url: str = field(repr=False)
    # Names of the mail folders to process.
    ms_outlook_folders: List[str]
    recursive: bool = False

    def __post_init__(self):
        """Validate the mandatory values and expose the token factory.

        Raises:
            ValueError: if the client id, client credential or user email is
                missing or empty.
        """
        if not (self.client_id and self.client_credential and self.user_email):
            # All three are required; the option names match the CLI flags.
            raise ValueError(
                "Please provide all of the following mandatory values:"
                "\n--ms-client-id\n--ms-client-cred\n--ms-user-email",
            )
        self.token_factory = self._acquire_token

    @requires_dependencies(["msal"])
    def _acquire_token(self):
        """Request a client-credentials token for Microsoft Graph.

        Returns:
            Whatever ``ConfidentialClientApplication.acquire_token_for_client``
            returns for the ``https://graph.microsoft.com/.default`` scope.

        Raises:
            ValueError: re-raised (after logging) when MSAL rejects the
                credentials/authority setup.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.client_credential,
            )
            token = app.acquire_token_for_client(
                scopes=["https://graph.microsoft.com/.default"],
            )
        except ValueError:
            logger.error("Couldn't set up credentials for Outlook")
            raise
        return token

    @staticmethod
    def parse_folders(folder_str: str) -> List[str]:
        """Parses a comma separated string of Outlook folders into a list."""
        return [x.strip() for x in folder_str.split(",")]
|
||||
|
||||
|
||||
@dataclass
class OutlookIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
    """A single Outlook message to download as ``.eml`` and process.

    Local file names are derived from a short hash of the message id.
    """

    config: SimpleOutlookConfig
    # Annotated as DriveItem; this class only uses its ``id``, ``subject`` and
    # ``download(...)`` members.
    file: DriveItem

    def __post_init__(self):
        self._set_download_paths()

    def hash_mail_name(self, id):
        """Outlook email ids are 152 char long. Hash to shorten to 16."""
        return hashlib.sha256(id.encode("utf-8")).hexdigest()[:16]

    def _set_download_paths(self) -> None:
        """Creates paths for downloading and parsing."""
        download_path = Path(f"{self.standard_config.download_dir}")
        output_path = Path(f"{self.standard_config.output_dir}")
        # Hash once so the download and output names cannot drift apart.
        hashed_name = self.hash_mail_name(self.file.id)

        self.download_dir = download_path
        self.download_filepath = (download_path / f"{hashed_name}.eml").resolve()
        self.output_dir = output_path
        self.output_filepath = (output_path / f"{hashed_name}.eml.json").resolve()

    @property
    def filename(self):
        """Absolute path of the downloaded ``.eml`` file."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Absolute path of the structured ``.eml.json`` output file."""
        return Path(self.output_filepath).resolve()

    def _discard_partial_download(self) -> None:
        """Remove whatever a failed download left at ``download_filepath``."""
        try:
            self.download_filepath.unlink()
        except FileNotFoundError:
            pass  # the failure happened before the file was created
        except OSError as e:
            logger.debug(
                f"Could not remove partial download {self.download_filepath}: {e}",
            )

    @BaseIngestDoc.skip_if_file_exists
    @requires_dependencies(["office365"])
    def get_file(self):
        """Relies on Office365 python sdk message object to do the download.

        Failures are logged and swallowed (best effort); nothing is raised.
        """
        try:
            if not self.download_dir.is_dir():
                logger.debug(f"Creating directory: {self.download_dir}")
                self.download_dir.mkdir(parents=True, exist_ok=True)

            # Write to the same path that ``filename`` reports.
            with open(self.download_filepath, "wb") as local_file:
                self.file.download(
                    local_file,
                ).execute_query()  # download MIME representation of a message

        except Exception as e:
            logger.error(
                f"Error while downloading and saving file: {self.file.subject}.",
            )
            logger.error(e)
            # open(..., "wb") creates the file before any bytes arrive; drop
            # it so a truncated file is not mistaken for a finished download
            # (presumably what skip_if_file_exists checks -- confirm).
            self._discard_partial_download()
            return
        logger.info(f"File downloaded: {self.file.subject}")
        return
|
||||
|
||||
|
||||
class OutlookConnector(ConnectorCleanupMixin, BaseConnector):
    """Connector that collects the messages of selected Outlook mail folders.

    Construction builds the Graph client and resolves the requested root
    folders, together with all of their subfolders, to folder ids.
    """

    config: SimpleOutlookConfig

    def __init__(
        self,
        standard_config: StandardConnectorConfig,
        config: SimpleOutlookConfig,
    ):
        super().__init__(standard_config, config)
        self._set_client()
        # Resolved eagerly: an unknown folder name raises MissingFolderError right here.
        self.get_folder_ids()

    @requires_dependencies(["office365"])
    def _set_client(self):
        """Create the Graph client from the config's token factory."""
        from office365.graph_client import GraphClient

        self.client = GraphClient(self.config.token_factory)

    def initialize(self):
        """No-op: the client and folder ids are already set up in ``__init__``."""
        pass

    def recurse_folders(self, folder_id, main_folder_dict):
        """Add the ids of all (nested) subfolders of ``folder_id`` to ``main_folder_dict``.

        We only get a count of subfolders for any folder, so additional calls
        are needed to get the subfolder ids.

        Args:
            folder_id: id of the mail folder whose children are fetched.
            main_folder_dict: maps a root folder name to the list of folder ids
                collected under it; updated in place.
        """
        subfolders = (
            self.client.users[self.config.user_email]
            .mail_folders[folder_id]
            .child_folders.get()
            .execute_query()
        )
        for subfolder in subfolders:
            parent_id = subfolder.get_property("parentFolderId")
            # File the subfolder under whichever root list already holds its parent.
            for folder_ids in main_folder_dict.values():
                if parent_id in folder_ids:
                    folder_ids.append(subfolder.id)
            if subfolder.get_property("childFolderCount") > 0:
                self.recurse_folders(subfolder.id, main_folder_dict)

    def get_folder_ids(self):
        """Sets the mail folder ids and subfolder ids for requested root mail folders.

        Populates ``self.root_folders`` (root folder name -> ids of the folder and
        its subfolders) and ``self.selected_folder_ids`` (flat list of ids for the
        folders named in ``config.ms_outlook_folders``, matched case-insensitively).

        Raises:
            MissingFolderError: if no root folder matches any requested name.
        """
        self.root_folders = defaultdict(list)
        root_folders_with_subfolders = []
        get_root_folders = (
            self.client.users[self.config.user_email].mail_folders.get().execute_query()
        )

        for folder in get_root_folders:
            self.root_folders[folder.display_name].append(folder.id)
            if folder.get_property("childFolderCount") > 0:
                root_folders_with_subfolders.append(folder.id)

        for folder in root_folders_with_subfolders:
            self.recurse_folders(folder, self.root_folders)

        # Narrow down all mail folder ids (plus all subfolders) to the ones that were requested.
        # The requested names are lower-cased once; a missing (None) setting selects
        # nothing and therefore ends in MissingFolderError below.
        requested = {name.lower() for name in (self.config.ms_outlook_folders or ())}
        self.selected_folder_ids = list(
            chain.from_iterable(
                ids for name, ids in self.root_folders.items() if name.lower() in requested
            ),
        )
        if not self.selected_folder_ids:
            raise MissingFolderError(
                f"There are no root folders with the names: {self.config.ms_outlook_folders}",
            )

    def get_ingest_docs(self):
        """Returns a list of all the message objects that are in the requested root folder(s)."""
        filtered_messages = []

        # Get all the relevant messages in the selected folders/subfolders.
        for folder_id in self.selected_folder_ids:
            messages = (
                self.client.users[self.config.user_email]
                .mail_folders[folder_id]
                .messages.get()
                .top(MAX_NUM_EMAILS)  # Prevents the return from paging
                .execute_query()
            )
            # Skip empty list if there are no messages in folder.
            if messages:
                filtered_messages.append(messages)

        # Filtered messages have an un-downloadable resource path.
        # So we get each message object individually.
        # The folder listings are flattened first so that all listing requests
        # are finished before the per-message requests start.
        listed_messages = list(chain.from_iterable(filtered_messages))
        individual_messages = [
            self.client.users[self.config.user_email].messages[m.id].get().execute_query()
            for m in listed_messages
        ]

        return [OutlookIngestDoc(self.standard_config, self.config, f) for f in individual_messages]
|
||||
@ -427,6 +427,17 @@ class MainProcess:
|
||||
default=None,
|
||||
help="Folder to start parsing files from.",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-user-email",
|
||||
default=None,
|
||||
help="Outlook email to download messages from.",
|
||||
)
|
||||
@click.option(
|
||||
"--ms-outlook-folders",
|
||||
default=None,
|
||||
help="Comma separated list of folders to download email messages from. "
|
||||
"Do not specify subfolders. Use quotes if spaces in folder names.",
|
||||
)
|
||||
@click.option(
|
||||
"--elasticsearch-url",
|
||||
default=None,
|
||||
@ -568,6 +579,8 @@ def main(
|
||||
ms_tenant,
|
||||
ms_user_pname,
|
||||
ms_onedrive_folder,
|
||||
ms_user_email,
|
||||
ms_outlook_folders,
|
||||
elasticsearch_url,
|
||||
elasticsearch_index_name,
|
||||
jq_query,
|
||||
@ -681,6 +694,8 @@ def main(
|
||||
hashed_dir_name = hashlib.sha256(
|
||||
f"{ms_tenant}_{ms_user_pname}".encode("utf-8"),
|
||||
)
|
||||
elif ms_user_email:
|
||||
hashed_dir_name = hashlib.sha256(ms_user_email.encode("utf-8"))
|
||||
elif confluence_url:
|
||||
hashed_dir_name = hashlib.sha256(
|
||||
f"{confluence_url}".encode("utf-8"),
|
||||
@ -910,7 +925,7 @@ def main(
|
||||
decay=biomed_decay,
|
||||
),
|
||||
)
|
||||
elif ms_client_id or ms_user_pname:
|
||||
elif ms_client_id and ms_user_pname:
|
||||
from unstructured.ingest.connector.onedrive import (
|
||||
OneDriveConnector,
|
||||
SimpleOneDriveConfig,
|
||||
@ -929,6 +944,25 @@ def main(
|
||||
),
|
||||
)
|
||||
|
||||
elif ms_client_id and ms_user_email:
|
||||
from unstructured.ingest.connector.outlook import (
|
||||
OutlookConnector,
|
||||
SimpleOutlookConfig,
|
||||
)
|
||||
|
||||
doc_connector = OutlookConnector( # type: ignore
|
||||
standard_config=standard_config,
|
||||
config=SimpleOutlookConfig(
|
||||
client_id=ms_client_id,
|
||||
client_credential=ms_client_cred,
|
||||
user_email=ms_user_email,
|
||||
tenant=ms_tenant,
|
||||
authority_url=ms_authority_url,
|
||||
ms_outlook_folders=SimpleOutlookConfig.parse_folders(ms_outlook_folders),
|
||||
recursive=recursive,
|
||||
),
|
||||
)
|
||||
|
||||
elif local_input_path:
|
||||
from unstructured.ingest.connector.local import (
|
||||
LocalConnector,
|
||||
|
||||
@ -111,7 +111,7 @@ def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetada
|
||||
if email_date is not None:
|
||||
email_date = convert_to_iso_8601(email_date)
|
||||
|
||||
sent_from = header_dict.get("To")
|
||||
sent_from = header_dict.get("From")
|
||||
if sent_from is not None:
|
||||
sent_from = [sender.strip() for sender in sent_from.split(",")]
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user